20#include <freerdp/config.h>
22#include <freerdp/types.h>
23#include <freerdp/primitives.h>
24#include <winpr/sysinfo.h>
26#include "prim_YCoCg.h"
28#include "prim_internal.h"
29#include "prim_templates.h"
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
/*
 * Convert packed YCoCg-R pixels (4 bytes per pixel) to a 32bpp RGB layout
 * using SSSE3 intrinsics.  This is the "invert" channel-order path; per the
 * dispatcher it serves PIXEL_FORMAT_BGRX32/BGRA32.  The only substantive
 * difference from the _no_invert twin is the operand order of the final
 * _mm_packus_epi16 that merges the red/blue planes.
 *
 * NOTE(review): this extract is incomplete -- gaps in the embedded original
 * line numbers show that braces, the declarations of dptr, w and R0..R7,
 * the destination-alignment startup code and the 8-pixels-per-pass inner
 * loop header were dropped.  Comments below describe the visible lines only.
 */
38static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
39 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
40 UINT32 dstStep, UINT32 width, UINT32 height,
41 UINT8 shift, BOOL withAlpha)
/* Walking source pointer, advanced row by row. */
43 const BYTE* sptr = pSrc;
/* Each stride must cover at least one full row of 32-bit pixels. */
46 WINPR_ASSERT(srcStep /
sizeof(UINT32) >= width);
47 WINPR_ASSERT(dstStep /
sizeof(UINT32) >= width);
/* Per-row padding: bytes to skip after the last pixel of a row. */
48 const size_t sRowBump = srcStep - width *
sizeof(UINT32);
49 const size_t dRowBump = dstStep - width *
sizeof(UINT32);
/* Shifting left by (shift - 1) folds the "<< shift" chroma scaling and the
 * divide-by-two of the YCoCg-R reconstruction into one shift. */
53 int dataShift = shift - 1;
/* Mask that clears the low bits a bytewise left shift would otherwise smear
 * across byte boundaries inside a 16-bit lane. */
54 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Too narrow for the 8-pixel SIMD loop, or a destination that can never be
 * brought to 16-byte alignment: punt to the generic C primitive. */
65 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
68 return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
69 DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
70 width, height, shift, withAlpha);
/* Row loop; the (elided) inner loop converts eight pixels per pass. */
73 for (UINT32 h = 0; h < height; h++)
/* Load two 16-byte registers = 8 input pixels. */
88 R0 = LOAD_SI128(sptr);
90 R1 = LOAD_SI128(sptr);
/* Shuffle control gathering like channels: in each source register, bytes
 * 0,4,8,12 go to the low dword, 1,5,9,13 to the next, and so on, so every
 * dword of R3/R4 holds four samples of a single channel plane. */
96 R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
97 R3 = _mm_shuffle_epi8(R0, R2);
98 R4 = _mm_shuffle_epi8(R1, R2);
/* Merge the half-planes of both registers: R5 = 8 luma bytes (low qword)
 * plus 8 alpha bytes (high qword); R6 = 8 Cg bytes plus 8 Co bytes. */
101 R5 = _mm_unpackhi_epi32(R3, R4);
102 R6 = _mm_unpacklo_epi32(R3, R4);
/* Alpha source is either the real alpha plane (withAlpha case)... */
108 R7 = _mm_unpackhi_epi64(R5, R5);
/* ...or forced opaque 0xFF -- the if/else around these two assignments was
 * elided by the extraction. */
110 R7 = mm_set1_epu32(0xFFFFFFFFU);
/* Widen the 8 luma bytes to unsigned 16-bit lanes (zero high bytes). */
114 R1 = mm_set1_epu32(0);
115 R0 = _mm_unpacklo_epi8(R5, R1);
/* Scale the chroma bytes by 2^(shift-1) inside their bytes, masking off the
 * bits that crossed a byte boundary within the 16-bit lanes. */
122 R6 = _mm_slli_epi16(R6, dataShift);
123 R1 = mm_set1_epu8(mask);
124 R6 = _mm_and_si128(R6, R1);
/* Sign-extend the scaled Co bytes to 16 bits: duplicate each byte into both
 * halves of a word, then arithmetic-shift right by 8. */
127 R1 = _mm_unpackhi_epi8(R6, R6);
128 R1 = _mm_srai_epi16(R1, 8);
/* Same sign-extension trick for the scaled Cg bytes. */
131 R2 = _mm_unpacklo_epi8(R6, R6);
132 R2 = _mm_srai_epi16(R2, 8);
/* YCoCg-R reconstruction in saturating 16-bit arithmetic:
 * t = Y - Cg;  R = t + Co;  G = Y + Cg;  B = t - Co. */
135 R6 = _mm_subs_epi16(R0, R2);
137 R3 = _mm_adds_epi16(R6, R1);
140 R4 = _mm_adds_epi16(R0, R2);
143 R5 = _mm_subs_epi16(R6, R1);
/* Pack back to unsigned bytes with saturation: R0 = red plane in the low
 * qword, blue plane in the high qword.  This operand order is what makes
 * this the "invert" variant (the twin swaps R3/R5 here). */
146 R0 = _mm_packus_epi16(R3, R5);
/* R1 = green plane duplicated; then replace its high qword with alpha. */
149 R1 = _mm_packus_epi16(R4, R4);
152 R1 = _mm_unpackhi_epi64(R1, R7);
/* Interleave the four channel planes back into per-pixel byte order... */
155 R2 = _mm_unpacklo_epi8(R0, R1);
157 R3 = _mm_unpackhi_epi8(R0, R1);
159 R4 = _mm_unpacklo_epi16(R2, R3);
161 R5 = _mm_unpackhi_epi16(R2, R3);
/* ...and store the 8 converted output pixels. */
163 STORE_SI128(dptr, R4);
165 STORE_SI128(dptr, R5);
/* Row remainder: let the generic primitive convert the final (w < 8)
 * pixels of this row as a 1-pixel-high strip. */
173 pstatus_t status = 0;
174 status =
generic->YCoCgToRGB_8u_AC4R(
175 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
176 WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);
/* Propagate a fallback failure (the early-return body was elided). */
178 if (status != PRIMITIVES_SUCCESS)
/* Advance both pointers past the remainder pixels just handled; the
 * sRowBump/dRowBump additions presumably follow in elided lines. */
181 sptr += w *
sizeof(UINT32);
182 dptr += w *
sizeof(UINT32);
189 return PRIMITIVES_SUCCESS;
/*
 * Convert packed YCoCg-R pixels (4 bytes per pixel) to a 32bpp RGB layout
 * using SSSE3 intrinsics.  This is the "no_invert" channel-order path; per
 * the dispatcher it serves PIXEL_FORMAT_RGBX32/RGBA32.  It mirrors the
 * _invert variant statement for statement; the single substantive
 * difference is the swapped operand order of the _mm_packus_epi16 that
 * merges the red/blue planes (see the comment at that line).
 *
 * NOTE(review): this extract is incomplete -- gaps in the embedded original
 * line numbers show that braces, the declarations of dptr/w/R7, the stride
 * asserts, the alignment startup code, the inner loop header and part of
 * the remainder call were dropped.  Comments describe visible lines only.
 */
193static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
const BYTE* WINPR_RESTRICT pSrc,
194 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
195 UINT32 DstFormat, UINT32 dstStep, UINT32 width,
196 UINT32 height, UINT8 shift, BOOL withAlpha)
/* Walking source pointer, advanced row by row. */
198 const BYTE* sptr = pSrc;
/* Per-row padding: bytes to skip after the last pixel of a row. */
200 size_t sRowBump = srcStep - width *
sizeof(UINT32);
201 size_t dRowBump = dstStep - width *
sizeof(UINT32);
/* (shift - 1) folds the chroma scaling and the YCoCg-R halve into one
 * left shift; mask clears bits smeared across byte boundaries. */
205 int dataShift = shift - 1;
206 BYTE mask = (BYTE)(0xFFU << dataShift);
/* Too narrow for SIMD, or a destination that can never reach 16-byte
 * alignment: punt to the generic C primitive. */
217 if ((width < 8) || (ULONG_PTR)dptr & 0x03)
220 return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
221 DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
222 width, height, shift, withAlpha);
/* Row loop; the (elided) inner loop converts eight pixels per pass. */
225 for (UINT32 h = 0; h < height; h++)
/* Load two 16-byte registers = 8 input pixels. */
234 __m128i R0 = LOAD_SI128(sptr);
236 __m128i R1 = LOAD_SI128(sptr);
/* Shuffle control gathering like channels so each dword of R3/R4 holds
 * four samples of a single channel plane. */
242 __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
243 __m128i R3 = _mm_shuffle_epi8(R0, R2);
244 __m128i R4 = _mm_shuffle_epi8(R1, R2);
/* R5 = 8 luma bytes + 8 alpha bytes; R6 = 8 Cg bytes + 8 Co bytes. */
247 __m128i R5 = _mm_unpackhi_epi32(R3, R4);
248 __m128i R6 = _mm_unpacklo_epi32(R3, R4);
/* Alpha source: the real alpha plane (withAlpha case)... */
254 R7 = _mm_unpackhi_epi64(R5, R5);
/* ...or forced opaque 0xFF (surrounding if/else elided). */
256 R7 = mm_set1_epu32(0xFFFFFFFFU);
/* Widen the 8 luma bytes to unsigned 16-bit lanes. */
260 R1 = mm_set1_epu32(0);
261 R0 = _mm_unpacklo_epi8(R5, R1);
/* Scale chroma by 2^(shift-1) within each byte and mask the spill. */
268 R6 = _mm_slli_epi16(R6, dataShift);
269 R1 = mm_set1_epu8(mask);
270 R6 = _mm_and_si128(R6, R1);
/* Sign-extend scaled Co bytes to 16 bits (duplicate + srai 8). */
273 R1 = _mm_unpackhi_epi8(R6, R6);
274 R1 = _mm_srai_epi16(R1, 8);
/* Same for the scaled Cg bytes. */
277 R2 = _mm_unpacklo_epi8(R6, R6);
278 R2 = _mm_srai_epi16(R2, 8);
/* YCoCg-R reconstruction with saturating 16-bit arithmetic:
 * t = Y - Cg;  R = t + Co;  G = Y + Cg;  B = t - Co. */
281 R6 = _mm_subs_epi16(R0, R2);
283 R3 = _mm_adds_epi16(R6, R1);
286 R4 = _mm_adds_epi16(R0, R2);
289 R5 = _mm_subs_epi16(R6, R1);
/* Pack to unsigned bytes: blue plane first, red plane second -- the
 * reverse of the _invert variant, yielding the swapped R/B byte order. */
296 R0 = _mm_packus_epi16(R5, R3);
/* R1 = green plane duplicated; replace its high qword with alpha. */
299 R1 = _mm_packus_epi16(R4, R4);
302 R1 = _mm_unpackhi_epi64(R1, R7);
/* Interleave the four channel planes back into per-pixel byte order... */
305 R2 = _mm_unpacklo_epi8(R0, R1);
307 R3 = _mm_unpackhi_epi8(R0, R1);
309 R4 = _mm_unpacklo_epi16(R2, R3);
311 R5 = _mm_unpackhi_epi16(R2, R3);
/* ...and store the 8 converted output pixels. */
313 STORE_SI128(dptr, R4);
315 STORE_SI128(dptr, R5);
/* Row remainder via the generic primitive (trailing arguments of this
 * call -- shift, withAlpha -- were elided by the extraction). */
323 pstatus_t status = 0;
324 status =
generic->YCoCgToRGB_8u_AC4R(
325 sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
326 WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
/* Propagate a fallback failure (return body elided). */
329 if (status != PRIMITIVES_SUCCESS)
/* Advance past the remainder pixels; the sRowBump/dRowBump additions
 * presumably follow in elided lines. */
332 sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) *
sizeof(UINT32);
333 dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) *
sizeof(UINT32);
340 return PRIMITIVES_SUCCESS;
/*
 * Public-facing SSSE3 entry point for YCoCg-R -> RGB 32bpp conversion.
 * Dispatches on DstFormat to the channel-order-specific worker, falling
 * back to the generic C primitive for any other destination format.
 *
 * NOTE(review): the switch header, the withAlpha parameter, braces and the
 * default label were elided by the extraction -- only the case bodies and
 * the fallback call are visible here.
 */
344static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
345 BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
346 INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
/* BGR-ordered destinations take the "invert" worker. */
351 case PIXEL_FORMAT_BGRX32:
352 case PIXEL_FORMAT_BGRA32:
353 return ssse3_YCoCgRToRGB_8u_AC4R_invert(
354 pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
355 WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
/* RGB-ordered destinations take the "no_invert" worker. */
357 case PIXEL_FORMAT_RGBX32:
358 case PIXEL_FORMAT_RGBA32:
359 return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
360 pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
361 WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);
/* Any other destination format: generic C implementation. */
364 return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
365 height, shift, withAlpha);
372void primitives_init_YCoCg_ssse3_int(
primitives_t* WINPR_RESTRICT prims)
374#if defined(SSE_AVX_INTRINSICS_ENABLED)
375 generic = primitives_get_generic();
377 WLog_VRB(PRIM_TAG,
"SSE3/SSSE3 optimizations");
378 prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
380 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE2 intrinsics not available");