16#include <winpr/sysinfo.h>
18#include <freerdp/config.h>
21#include <freerdp/types.h>
22#include <freerdp/primitives.h>
23#include <freerdp/log.h>
25#include "prim_internal.h"
26#include "prim_avxsse.h"
28#include "../codec/color.h"
30#include <freerdp/codec/color.h>
32#if defined(SSE_AVX_INTRINSICS_ENABLED)
/*
 * Copy a rectangle of 3-byte source pixels into 4-byte destination pixels
 * (srcByte = 3, dstByte = 4), four pixels per iteration with SSE shuffles,
 * and return PRIMITIVES_SUCCESS.
 *
 * Row addresses are computed as
 *   base[VMultiplier * (y + nY) * nStep + VOffset]
 * for both source and destination, and the column is offset by nXSrc / nXDst
 * pixels.
 *
 * NOTE(review): this listing appears to be missing lines. nHeight and x are
 * used below without a visible declaration, and neither a store of d0 nor the
 * body of the tail loop is visible. Confirm against the complete file.
 */
36static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
37 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
39 const BYTE* WINPR_RESTRICT pSrcData,
40 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
41 int64_t srcVMultiplier, int64_t srcVOffset,
42 int64_t dstVMultiplier, int64_t dstVOffset)
45 const int64_t srcByte = 3;
46 const int64_t dstByte = 4;
/* Blend selector: 0xFF000000 in every 32-bit lane. Assuming mm_set_epu32 packs
 * its arguments as four 32-bit lanes, this marks the top byte of each
 * destination pixel. */
48 const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
/* Shuffle control: byte indices 0x00..0x0b are spread three per 32-bit lane,
 * and the 0xff entries make _mm_shuffle_epi8 write zero to that byte.
 * Presumably the lowest lane is the last argument (0xff020100) -- TODO confirm
 * against the mm_set_epu32 helper. */
49 const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
/* width is nWidth rounded down to a multiple of 4; the remaining rem pixels
 * are left for the scalar tail loop. */
50 const UINT32 rem = nWidth % 4;
52 const int64_t width = nWidth - rem;
53 for (int64_t y = 0; y < nHeight; y++)
55 const BYTE* WINPR_RESTRICT srcLine =
56 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
57 BYTE* WINPR_RESTRICT dstLine =
58 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: four pixels per iteration. */
62 for (; x < width; x += 4)
64 const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
65 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
/* NOTE(review): four source pixels occupy 12 bytes, but the load goes through
 * a __m128i pointer. If LOAD_SI128 reads a full 16 bytes, it touches 4 bytes
 * beyond the pixels consumed here -- confirm this is safe for the last group
 * in the source buffer. */
66 const __m128i s0 = LOAD_SI128(src);
67 const __m128i s1 = _mm_shuffle_epi8(s0, smask);
68 const __m128i s2 = LOAD_SI128(dst);
/* _mm_blendv_epi8 takes bytes from its second operand (s2, the current
 * destination) where the mask byte has its high bit set, and from the
 * shuffled source s1 elsewhere. */
70 __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
/* Scalar tail for the pixels left over after the vector loop. */
74 for (; x < nWidth; x++)
76 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
77 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
84 return PRIMITIVES_SUCCESS;
/*
 * Copy a rectangle between two buffers of 4-byte pixels (srcByte = dstByte = 4)
 * while keeping byte 3 of every destination pixel, four pixels per iteration,
 * and return PRIMITIVES_SUCCESS.
 *
 * Row addresses are computed as
 *   base[VMultiplier * (y + nY) * nStep + VOffset]
 * for both source and destination, and the column is offset by nXSrc / nXDst
 * pixels.
 *
 * NOTE(review): this listing appears to be missing lines. nHeight and x are
 * used below without a visible declaration, and the body of the tail loop is
 * not visible. Confirm against the complete file.
 */
87static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
88 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
90 const BYTE* WINPR_RESTRICT pSrcData,
91 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
92 int64_t srcVMultiplier, int64_t srcVOffset,
93 int64_t dstVMultiplier, int64_t dstVOffset)
96 const int64_t srcByte = 4;
97 const int64_t dstByte = 4;
/* Blend selector listed in memory order (_mm_setr_epi8): 0xFF for bytes 0..2
 * of each 4-byte pixel and 0x00 for byte 3. */
99 const __m128i mask = _mm_setr_epi8((
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF,
100 (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF,
101 (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
/* width is nWidth rounded down to a multiple of 4; the remaining rem pixels
 * are left for the scalar tail loop. */
102 const UINT32 rem = nWidth % 4;
103 const int64_t width = nWidth - rem;
104 for (int64_t y = 0; y < nHeight; y++)
106 const BYTE* WINPR_RESTRICT srcLine =
107 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
108 BYTE* WINPR_RESTRICT dstLine =
109 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: four pixels (16 bytes) per iteration. */
112 for (; x < width; x += 4)
114 const __m128i* src = (
const __m128i*)&srcLine[(x + nXSrc) * srcByte];
115 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
116 const __m128i s0 = LOAD_SI128(src);
117 const __m128i s1 = LOAD_SI128(dst);
/* _mm_blendv_epi8 takes bytes from its second operand (s0, the source) where
 * the mask byte has its high bit set, and from s1 (the current destination)
 * elsewhere, so byte 3 of each destination pixel is kept. */
118 __m128i d0 = _mm_blendv_epi8(s1, s0, mask);
119 STORE_SI128(dst, d0);
/* Scalar tail for the pixels left over after the vector loop. */
122 for (; x < nWidth; x++)
124 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
125 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
132 return PRIMITIVES_SUCCESS;
/*
 * Dispatcher that routes a copy to one of the SSE helpers based on pixel
 * format, falling back to gen->copy_no_overlap when no helper is chosen.
 *
 * pDstData and pSrcData are asserted non-NULL.
 *
 * NOTE(review): the switch statements that own the case labels below are not
 * visible in this listing, nor is the final parameter (dstVOffset is used but
 * not declared in the visible signature) or the declaration of gen.
 * Which labels test SrcFormat and which test DstFormat cannot be determined
 * from here -- confirm against the complete file.
 */
135static pstatus_t sse_image_copy_no_overlap_dst_alpha(
136 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
137 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
138 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
139 UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
142 WINPR_ASSERT(pDstData);
143 WINPR_ASSERT(pSrcData);
/* BGR24 paired with BGRX32/BGRA32 goes to the 3-byte-to-4-byte helper. */
147 case PIXEL_FORMAT_BGR24:
150 case PIXEL_FORMAT_BGRX32:
151 case PIXEL_FORMAT_BGRA32:
152 return sse_image_copy_bgr24_bgrx32(
153 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
154 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* BGRX32/BGRA32 paired with BGRX32/BGRA32 goes to the 4-byte helper. */
159 case PIXEL_FORMAT_BGRX32:
160 case PIXEL_FORMAT_BGRA32:
163 case PIXEL_FORMAT_BGRX32:
164 case PIXEL_FORMAT_BGRA32:
165 return sse_image_copy_bgrx32_bgrx32(
166 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
167 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* RGBX32/RGBA32 paired with RGBX32/RGBA32 reuses the same 4-byte helper. */
172 case PIXEL_FORMAT_RGBX32:
173 case PIXEL_FORMAT_RGBA32:
176 case PIXEL_FORMAT_RGBX32:
177 case PIXEL_FORMAT_RGBA32:
178 return sse_image_copy_bgrx32_bgrx32(
179 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
180 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* Fallback: delegate to gen->copy_no_overlap. The V multiplier/offset
 * arguments are not forwarded on this path. */
190 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
191 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * SSE entry point for copy_no_overlap: validates the request, derives the
 * row-addressing parameters, and dispatches to one of three copy paths.
 *
 * Returns PRIMITIVES_SUCCESS immediately when nWidth or nHeight is 0;
 * otherwise returns whatever the selected copy routine returns.
 *
 * NOTE(review): this listing appears to be missing lines. The statements
 * guarded by the INT32_MAX and NULL checks, and the conditions guarding the
 * step and srcVOffset assignments, are not visible. The declaration of gen is
 * not visible either. Confirm against the complete file.
 */
194static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
195 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
196 UINT32 nWidth, UINT32 nHeight,
197 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
198 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
199 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
201 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
/* Defaults: multiplier 1 and offset 0 for both source and destination. */
202 int64_t srcVOffset = 0;
203 int64_t srcVMultiplier = 1;
204 int64_t dstVOffset = 0;
205 int64_t dstVMultiplier = 1;
/* An empty rectangle is a successful no-op. */
207 if ((nWidth == 0) || (nHeight == 0))
208 return PRIMITIVES_SUCCESS;
210 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
213 if (!pDstData || !pSrcData)
/* Steps computed from width times bytes per pixel; presumably only when the
 * caller passed a step of 0 -- the guarding conditions are not visible. */
217 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
220 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
/* Byte offset of the last source row; presumably applied when vSrcVFlip is
 * set -- the guarding condition is not visible. */
224 srcVOffset = (nHeight - 1ll) * nSrcStep;
/* Dispatch:
 *  1. FREERDP_KEEP_DST_ALPHA requested and DstFormat has alpha: dst-alpha path.
 *  2. formats equal ignoring alpha: generic memcpy-based copy.
 *  3. anything else: gen->copy_no_overlap. */
228 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
229 return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
230 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
231 nXSrc, nYSrc, palette, flags, srcVMultiplier,
232 srcVOffset, dstVMultiplier, dstVOffset);
233 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
234 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
235 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
236 nXSrc, nYSrc, palette, srcVMultiplier,
237 srcVOffset, dstVMultiplier, dstVOffset, flags);
241 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
242 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Install the SSE4.1 copy primitive into prims.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, logs "SSE4.1 optimizations" at
 * verbose level and sets prims->copy_no_overlap to sse_image_copy_no_overlap.
 * The second log message presumably belongs to the #else branch, which is not
 * visible in this listing (the braces and the #else/#endif lines are
 * missing) -- confirm against the complete file.
 */
248void primitives_init_copy_sse41_int(
primitives_t* WINPR_RESTRICT prims)
250#if defined(SSE_AVX_INTRINSICS_ENABLED)
251 WLog_VRB(PRIM_TAG,
"SSE4.1 optimizations");
252 prims->copy_no_overlap = sse_image_copy_no_overlap;
254 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE4.1 intrinsics not available");
fn_copy_no_overlap_t copy_no_overlap