16#include <winpr/sysinfo.h>
18#include <freerdp/config.h>
21#include <freerdp/types.h>
22#include <freerdp/primitives.h>
23#include <freerdp/log.h>
25#include "prim_internal.h"
27#include "../codec/color.h"
29#include <freerdp/codec/color.h>
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
/*
 * Build a 256-bit integer vector from eight unsigned 32-bit values.
 *
 * Each argument is cast to int32_t and forwarded, in the same order, to
 * _mm256_set_epi32. That intrinsic takes its operands from the most
 * significant element down, so i0 becomes the highest 32-bit element and i7
 * the lowest. Callers in this file pass constants such as 0xFF000000, which
 * do not fit a signed int32_t without a cast.
 */
35static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
36 uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
38 return _mm256_set_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4,
39 (int32_t)i5, (int32_t)i6, (int32_t)i7);
/*
 * Copy a rectangle of 3-byte source pixels into 4-byte destination pixels,
 * eight pixels per AVX2 iteration, leaving the fourth byte of every
 * destination pixel as it already was.
 *
 * Row y of the source starts at
 *   pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]
 * and row y of the destination at
 *   pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset].
 * nXSrc and nXDst are pixel offsets within those rows; they are scaled by
 * srcByte and dstByte respectively.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): nHeight is used by the row loop but does not appear in the
 * parameter list as shown here; the declaration of x and the statements of
 * the scalar tail loop are not visible either. This listing appears to have
 * lost lines - confirm against the complete file.
 */
42static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
43 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
45 const BYTE* WINPR_RESTRICT pSrcData,
46 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
47 int64_t srcVMultiplier, int64_t srcVOffset,
48 int64_t dstVMultiplier, int64_t dstVOffset)
/* Bytes per pixel on each side. */
51 const int64_t srcByte = 3;
52 const int64_t dstByte = 4;
/* Selects the top byte of every 32-bit pixel; used by the final blend to keep
 * that byte from the destination. */
54 const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
55 0xFF000000, 0xFF000000, 0xFF000000);
/*
 * Byte-shuffle control that spreads packed 3-byte pixels into 4-byte slots.
 * _mm256_shuffle_epi8 works independently on each 128-bit half and only uses
 * the low four bits of an index; an index with the top bit set (0xff) yields
 * a zero byte.
 *  - lower half: source bytes 0..11 become pixels 0..3.
 *  - upper half: only source bytes 16..23 are reachable, so pixel 4 and the
 *    first byte of pixel 5 are left zero here and filled in from shelpmask.
 */
56 const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
57 0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
/* Helper shuffle applied to a copy of the lower source half: places source
 * bytes 12..15 into the upper-half slots that smask could not fill. */
58 const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
59 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
/* width is the largest multiple of 8 that does not exceed nWidth; the
 * remaining rem pixels are left to the loop after the vector loop. */
60 const UINT32 rem = nWidth % 8;
61 const int64_t width = nWidth - rem;
63 for (int64_t y = 0; y < nHeight; y++)
65 const BYTE* WINPR_RESTRICT srcLine =
66 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
67 BYTE* WINPR_RESTRICT dstLine =
68 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: eight pixels (24 source bytes, 32 destination bytes) per pass. */
73 for (; x < width; x += 8)
75 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
76 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
/*
 * Unaligned 32-byte load, although only 24 bytes belong to these 8 pixels.
 * NOTE(review): for the last full group of a row this reads 8 bytes beyond
 * the pixels being copied - confirm the source buffer always has that much
 * readable memory after the rectangle.
 */
77 const __m256i s0 = _mm256_loadu_si256(src);
78 __m256i s1 = _mm256_shuffle_epi8(s0, smask);
/* Duplicate the lower 128 bits of the source into both halves so that source
 * bytes 12..15 can be shuffled into position within the upper half. */
82 const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
83 const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
/* Marks exactly the four upper-half byte positions supplied by sxx
 * (three bytes of pixel 4 and the first byte of pixel 5). */
84 const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
85 0x00000000, 0x00000000, 0x00000000, 0x00000000);
86 const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
/* Read-modify-write: take the destination's current fourth byte of each pixel
 * and the three converted source bytes, then store the result. */
88 const __m256i s2 = _mm256_loadu_si256(dst);
89 __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
90 _mm256_storeu_si256(dst, d0);
/* Tail loop over the remaining nWidth % 8 pixels. Only the pointer setup is
 * visible here; presumably three bytes are copied per pixel - confirm. */
93 for (; x < nWidth; x++)
95 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
96 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
103 return PRIMITIVES_SUCCESS;
/*
 * Copy a rectangle between two 4-byte-per-pixel buffers, eight pixels per
 * AVX2 iteration, overwriting bytes 0..2 of every destination pixel and
 * leaving byte 3 as it already was.
 *
 * Bytes are copied by position only, so no channel reordering takes place.
 *
 * Row y of the source starts at
 *   pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]
 * and row y of the destination at
 *   pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset].
 * nXSrc and nXDst are pixel offsets within those rows.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the declaration of x and the statements of the scalar tail
 * loop are not visible in this listing - confirm against the complete file.
 */
106static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
107 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
108 UINT32 nWidth, UINT32 nHeight,
109 const BYTE* WINPR_RESTRICT pSrcData,
110 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
111 int64_t srcVMultiplier, int64_t srcVOffset,
112 int64_t dstVMultiplier, int64_t dstVOffset)
/* Bytes per pixel on each side. */
115 const int64_t srcByte = 4;
116 const int64_t dstByte = 4;
/* Per-byte blend selector in memory order: 0xFF for bytes 0..2 of each pixel
 * (take the source), 0x00 for byte 3 (keep the destination). */
118 const __m256i mask = _mm256_setr_epi8(
119 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
120 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
121 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
122 (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
/* width is the largest multiple of 8 that does not exceed nWidth; the
 * remaining rem pixels are left to the loop after the vector loop. */
123 const UINT32 rem = nWidth % 8;
124 const int64_t width = nWidth - rem;
125 for (int64_t y = 0; y < nHeight; y++)
127 const BYTE* WINPR_RESTRICT srcLine =
128 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
129 BYTE* WINPR_RESTRICT dstLine =
130 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* Vector loop: eight pixels (32 bytes on both sides) per pass. */
133 for (; x < width; x += 8)
135 const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
136 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
/* Unaligned loads of source and current destination, blended so that the
 * destination keeps its own byte 3, then stored back unaligned. */
137 const __m256i s0 = _mm256_loadu_si256(src);
138 const __m256i s1 = _mm256_loadu_si256(dst);
139 __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
140 _mm256_storeu_si256(dst, d0);
/* Tail loop over the remaining nWidth % 8 pixels. Only the pointer setup is
 * visible here; presumably three bytes are copied per pixel - confirm. */
143 for (; x < nWidth; x++)
145 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
146 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
153 return PRIMITIVES_SUCCESS;
/*
 * Pick an AVX2 kernel for a copy that must not overwrite the fourth byte of
 * the destination pixels, based on the source/destination pixel formats.
 *
 * Format pairs visible below and the kernel each one reaches:
 *   BGR24            with BGRX32 / BGRA32 -> avx2_image_copy_bgr24_bgrx32
 *   BGRX32 / BGRA32  with BGRX32 / BGRA32 -> avx2_image_copy_bgrx32_bgrx32
 *   RGBX32 / RGBA32  with RGBX32 / RGBA32 -> avx2_image_copy_bgrx32_bgrx32
 * Any other combination is forwarded to gen->copy_no_overlap with the
 * caller's original arguments (the V multiplier/offset values are not passed
 * on in that call).
 *
 * NOTE(review): the switch statements themselves are not visible in this
 * listing. Presumably the outer switch is on SrcFormat and the inner one on
 * DstFormat, since the BGR24 label precedes the 24-to-32 kernel - confirm.
 * The last parameter (dstVOffset, used in the calls below) and the definition
 * of gen are also outside the visible lines.
 */
156static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
157 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
158 UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
159 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette,
160 UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
/* Both buffers are required; a NULL pointer is a programming error here. */
163 WINPR_ASSERT(pDstData);
164 WINPR_ASSERT(pSrcData);
/* 3-byte source into a 4-byte destination. */
168 case PIXEL_FORMAT_BGR24:
171 case PIXEL_FORMAT_BGRX32:
172 case PIXEL_FORMAT_BGRA32:
173 return avx2_image_copy_bgr24_bgrx32(
174 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
175 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* 4-byte BGR-ordered formats on both sides. */
180 case PIXEL_FORMAT_BGRX32:
181 case PIXEL_FORMAT_BGRA32:
184 case PIXEL_FORMAT_BGRX32:
185 case PIXEL_FORMAT_BGRA32:
186 return avx2_image_copy_bgrx32_bgrx32(
187 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
188 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* 4-byte RGB-ordered formats on both sides; the same kernel is reused because
 * it copies bytes 0..2 by position without reordering them. */
193 case PIXEL_FORMAT_RGBX32:
194 case PIXEL_FORMAT_RGBA32:
197 case PIXEL_FORMAT_RGBX32:
198 case PIXEL_FORMAT_RGBA32:
199 return avx2_image_copy_bgrx32_bgrx32(
200 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
201 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
/* No AVX2 kernel for this format pair: hand over to gen's implementation. */
211 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
212 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * AVX2 entry point for copying an nWidth x nHeight pixel rectangle from
 * pSrcData to pDstData; installed as prims->copy_no_overlap by
 * primitives_init_copy_avx2_int.
 *
 * Dispatch order, as visible below:
 *   1. FREERDP_KEEP_DST_ALPHA set and DstFormat has an alpha channel
 *        -> avx2_image_copy_no_overlap_dst_alpha
 *   2. source and destination formats equal when alpha is ignored
 *        -> generic_image_copy_no_overlap_memcpy
 *   3. otherwise -> gen->copy_no_overlap
 *
 * Returns PRIMITIVES_SUCCESS immediately when nWidth or nHeight is 0;
 * otherwise returns whatever the selected implementation returns.
 *
 * NOTE(review): several statements are missing from this listing - the
 * results of the INT32_MAX and NULL checks, the conditions guarding the
 * step and flip assignments, and the definition of gen. The comments on
 * those lines are therefore hedged; confirm against the complete file.
 */
215static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
216 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
217 UINT32 nWidth, UINT32 nHeight,
218 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
219 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
220 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
/* Vertical-flip request taken from flags; the row-address parameters start
 * out as the identity mapping (multiplier 1, offset 0). */
222 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
223 int64_t srcVOffset = 0;
224 int64_t srcVMultiplier = 1;
225 int64_t dstVOffset = 0;
226 int64_t dstVMultiplier = 1;
/* Nothing to copy. */
228 if ((nWidth == 0) || (nHeight == 0))
229 return PRIMITIVES_SUCCESS;
/* Dimension sanity check; the statement taken on failure is not visible -
 * presumably an error return. */
231 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
/* Buffer check; the statement taken on failure is not visible - presumably
 * an error return. */
234 if (!pDstData || !pSrcData)
/* Row strides computed for tightly packed rows (width * bytes per pixel).
 * Presumably only applied when the caller passed a step of 0 - the guarding
 * conditions are not visible. */
238 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
241 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
/* Byte offset of the last source row. Presumably set only when vSrcVFlip is
 * TRUE, together with a negative srcVMultiplier, so rows are read bottom-up -
 * the surrounding statements are not visible. */
245 srcVOffset = (nHeight - 1ll) * nSrcStep;
249 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
250 return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
251 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
252 nXSrc, nYSrc, palette, flags, srcVMultiplier,
253 srcVOffset, dstVMultiplier, dstVOffset);
254 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
255 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
256 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
257 nXSrc, nYSrc, palette, srcVMultiplier,
258 srcVOffset, dstVMultiplier, dstVOffset, flags);
/* Formats differ beyond alpha: hand over to gen's implementation, which
 * receives the original flags rather than the precomputed V parameters. */
262 return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
263 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
/*
 * Install the AVX2 copy implementation into a primitives table.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, logs a verbose message and
 * points prims->copy_no_overlap at avx2_image_copy_no_overlap. The second
 * log call below reports that AVX2 support is unavailable; it presumably
 * sits in the #else branch, which (like #endif) is not visible in this
 * listing - confirm against the complete file.
 *
 * prims is dereferenced without a NULL check in the lines shown.
 */
269void primitives_init_copy_avx2_int(
primitives_t* WINPR_RESTRICT prims)
271#if defined(SSE_AVX_INTRINSICS_ENABLED)
272 WLog_VRB(PRIM_TAG,
"AVX2 optimizations");
273 prims->copy_no_overlap = avx2_image_copy_no_overlap;
275 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
fn_copy_no_overlap_t copy_no_overlap