20#include <freerdp/config.h>
22#include <freerdp/types.h>
23#include <freerdp/primitives.h>
24#include <winpr/sysinfo.h>
26#include "prim_colors.h"
28#include "prim_internal.h"
29#include "prim_templates.h"
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
36#define CACHE_LINE_BYTES 64
/*
 * Fixed-point YCbCr -> RGB coefficient table; every row holds four int32_t factors.
 * Within this file the SSE2 kernels read row 14 (narrowed to int16_t for
 * _mm_set1_epi16) and the scalar tail loops read row 16. The scalar loops multiply
 * columns 0 and 1 with Cr and columns 2 and 3 with Cb.
 * NOTE(review): from the second row on the magnitudes roughly double per row, which
 * suggests the row index is a fixed-point shift count — confirm that the row numbering
 * is intact before relying on a particular index.
 */
43static const int32_t ycbcr_table[][4] = { { 1, 0, -1, 2 },
49 { 90, -22, -46, 113 },
50 { 180, -44, -91, 227 },
51 { 359, -88, -183, 453 },
52 { 718, -176, -366, 906 },
53 { 1437, -352, -731, 1812 },
54 { 2873, -705, -1462, 3625 },
55 { 5747, -1409, -2925, 7250 },
56 { 11493, -2818, -5849, 14500 },
57 { 22987, -5636, -11698, 29000 },
58 { 45974, -11272, -23396, 57999 },
59 { 91947, -22544, -46793, 115999 },
60 { 183894, -45089, -93585, 231997 },
61 { 367788, -90178, -187171, 463995 },
62 { 735576, -180355, -374342, 927990 },
63 { 1471152, -360710, -748683, 1855980 },
64 { 2942304, -721420, -1497367, 3711959 },
65 { 5884609, -1442841, -2994733, 7423918 },
66 { 11769217, -2885681, -5989466, 14847836 },
67 { 23538434, -5771362, -11978932, 29695672 },
68 { 47076868, -11542725, -23957864, 59391345 },
69 { 94153736, -23085449, -47915729, 118782689 },
70 { 188307472, -46170898, -95831458, 237565379 },
71 { 376614945, -92341797, -191662916, 475130757 },
72 { 753229890, -184683594, -383325831, 950261514 },
73 { 1506459779, -369367187, -766651662, 1900523028 } };
/**
 * Clamp every signed 16-bit lane of @p val so that it is not below the matching lane
 * of @p min and not above the matching lane of @p max.
 *
 * @param val lanes to clamp
 * @param min per-lane lower bound
 * @param max per-lane upper bound
 * @return the clamped lanes
 */
static inline __m128i mm_between_epi16_int(__m128i val, __m128i min, __m128i max)
{
	/* Raise to the lower bound first, then cap at the upper bound. */
	const __m128i raised = _mm_max_epi16(val, min);
	const __m128i capped = _mm_min_epi16(max, raised);
	return capped;
}

/* Statement form of the clamp: overwrites _val in place, so _val must be an lvalue. */
#define mm_between_epi16(_val, _min, _max) \
	(_val) = mm_between_epi16_int((_val), (_min), (_max))
82static inline void mm_prefetch_buffer(
const void* WINPR_RESTRICT buffer,
size_t width,
83 size_t stride,
size_t height)
85 const size_t srcbump = stride /
sizeof(__m128i);
86 const __m128i* buf = (
const __m128i*)buffer;
88 for (
size_t y = 0; y < height; y++)
90 const __m128i* line = &buf[y * srcbump];
91 for (
size_t x = 0; x < width *
sizeof(INT16) /
sizeof(__m128i);
92 x += (CACHE_LINE_BYTES /
sizeof(__m128i)))
94 const char* ptr = (
const char*)&line[x];
95 _mm_prefetch(ptr, _MM_HINT_NTA);
/*
 * SSE2 conversion of three planar INT16 channels (Y, Cb, Cr taken from pSrc[0..2]) into
 * packed 4-byte pixels at pDst; this is the variant the dispatcher selects for
 * PIXEL_FORMAT_BGRA32 / PIXEL_FORMAT_BGRX32. Each vector iteration handles 16 pixels
 * (two 128-bit loads per plane, four 128-bit stores); the remaining roi->width % 16
 * pixels of every row go through a scalar loop. Returns PRIMITIVES_SUCCESS.
 */
102sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(
const INT16* WINPR_RESTRICT pSrc[3],
103 WINPR_ATTR_UNUSED UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
/* Saturation bounds for the 8-bit colour channels. */
107 const __m128i zero = _mm_setzero_si128();
108 const __m128i max = _mm_set1_epi16(255);
/*
 * Row 14 of ycbcr_table, narrowed to int16_t, provides the four multipliers.
 * NOTE(review): presumably bound to r_cr, g_cb, g_cr and b_cb in column order — confirm.
 */
110 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0]));
112 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1]));
114 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2]));
116 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3]));
/* Offset added to every Y sample before scaling. */
117 const __m128i c4096 = _mm_set1_epi16(4096);
118 const INT16* y_buf = pSrc[0];
119 const INT16* cb_buf = pSrc[1];
120 const INT16* cr_buf = pSrc[2];
/* Pixels per row that do not fill a complete 16-pixel vector iteration. */
121 const UINT32 pad = roi->width % 16;
/* Number of INT16 lanes in one 128-bit register. */
122 const UINT32 step =
sizeof(__m128i) /
sizeof(INT16);
/* Number of 128-bit vectors covering the vectorised part of a row. */
123 const size_t imax = (roi->width - pad) *
sizeof(INT16) /
sizeof(__m128i);
/* dstStep minus the 4-bytes-per-pixel payload of one row. */
125 const size_t dstPad = (dstStep - roi->width * 4);
/* Issue prefetch hints for the three source planes. */
127 mm_prefetch_buffer(y_buf, roi->width, (
size_t)srcStep, roi->height);
128 mm_prefetch_buffer(cr_buf, roi->width, (
size_t)srcStep, roi->height);
129 mm_prefetch_buffer(cb_buf, roi->width, (
size_t)srcStep, roi->height);
131 for (UINT32 yp = 0; yp < roi->height; ++yp)
/* Two vectors (16 pixels) per iteration. */
133 for (
size_t i = 0; i < imax; i += 2)
/* First group of 8 pixels: y = (Y + 4096) >> 2. */
155 __m128i y1 = LOAD_SI128(y_buf);
157 y1 = _mm_add_epi16(y1, c4096);
158 y1 = _mm_srai_epi16(y1, 2);
160 __m128i cb1 = LOAD_SI128(cb_buf);
163 __m128i cr1 = LOAD_SI128(cr_buf);
/* r = clamp((y + mulhi(cr, r_cr)) >> 3, 0, 255) */
166 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
167 r1 = _mm_srai_epi16(r1, 3);
169 mm_between_epi16(r1, zero, max);
/* g = clamp((y + mulhi(cb, g_cb) + mulhi(cr, g_cr)) >> 3, 0, 255) */
171 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
172 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
173 g1 = _mm_srai_epi16(g1, 3);
175 mm_between_epi16(g1, zero, max);
/* b = clamp((y + mulhi(cb, b_cb)) >> 3, 0, 255) */
177 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
178 b1 = _mm_srai_epi16(b1, 3);
180 mm_between_epi16(b1, zero, max);
/* Second group of 8 pixels, same arithmetic as above. */
181 __m128i y2 = LOAD_SI128(y_buf);
183 y2 = _mm_add_epi16(y2, c4096);
184 y2 = _mm_srai_epi16(y2, 2);
186 __m128i cb2 = LOAD_SI128(cb_buf);
189 __m128i cr2 = LOAD_SI128(cr_buf);
192 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
193 r2 = _mm_srai_epi16(r2, 3);
195 mm_between_epi16(r2, zero, max);
197 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
198 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
199 g2 = _mm_srai_epi16(g2, 3);
201 mm_between_epi16(g2, zero, max);
203 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
204 b2 = _mm_srai_epi16(b2, 3);
206 mm_between_epi16(b2, zero, max);
/*
 * Narrow the 16-bit channel lanes to bytes (unsigned saturation) and interleave them,
 * first byte-wise and then in 16-bit pairs, to form the packed pixels.
 * NOTE(review): the 0xFFFFFFFF vector presumably supplies the constant fourth byte —
 * confirm against mm_set1_epu32.
 */
213 R0 = _mm_packus_epi16(R0, R1);
216 R1 = _mm_packus_epi16(R1, R2);
218 R2 = _mm_unpacklo_epi8(R0, R2);
219 R1 = _mm_unpackhi_epi8(R0, R1);
222 R0 = _mm_packus_epi16(R0, R3);
223 R3 = mm_set1_epu32(0xFFFFFFFFU);
225 R4 = _mm_unpacklo_epi8(R0, R4);
226 R3 = _mm_unpackhi_epi8(R0, R3);
228 R0 = _mm_unpacklo_epi16(R2, R0);
229 R4 = _mm_unpackhi_epi16(R2, R4);
231 R2 = _mm_unpacklo_epi16(R1, R2);
232 R3 = _mm_unpackhi_epi16(R1, R3);
/* Four 16-byte stores = 64 bytes = 16 output pixels. */
233 STORE_SI128(d_buf, R0);
234 d_buf +=
sizeof(__m128i);
235 STORE_SI128(d_buf, R4);
236 d_buf +=
sizeof(__m128i);
237 STORE_SI128(d_buf, R2);
238 d_buf +=
sizeof(__m128i);
239 STORE_SI128(d_buf, R3);
240 d_buf +=
sizeof(__m128i);
/* Scalar path for the trailing `pad` pixels of the row. */
244 for (UINT32 i = 0; i < pad; i++)
/* The scalar path works with row 16 of ycbcr_table and a 16-bit fixed-point Y. */
246 const INT32 divisor = 16;
247 const INT32 Y = ((*y_buf++) + 4096) << divisor;
248 const INT32 Cb = (*cb_buf++);
249 const INT32 Cr = (*cr_buf++);
250 const INT32 CrR = Cr * ycbcr_table[divisor][0];
251 const INT32 CrG = Cr * ycbcr_table[divisor][1];
252 const INT32 CbG = Cb * ycbcr_table[divisor][2];
253 const INT32 CbB = Cb * ycbcr_table[divisor][3];
/*
 * Drop the fixed-point scaling (>> divisor) and the remaining 5 fractional bits.
 * NOTE(review): the vector path adds the Cb/Cr green products to Y, whereas G below
 * subtracts them, and columns [1]/[2] are paired with Cr/Cb here — confirm both paths
 * yield the same green value.
 */
254 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
255 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
256 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
266 return PRIMITIVES_SUCCESS;
/*
 * SSE2 conversion of three planar INT16 channels (Y, Cb, Cr taken from pSrc[0..2]) into
 * packed 4-byte pixels at pDst; this is the variant the dispatcher selects for
 * PIXEL_FORMAT_RGBA32 / PIXEL_FORMAT_RGBX32. Each vector iteration handles 16 pixels
 * (two 128-bit loads per plane, four 128-bit stores); the remaining roi->width % 16
 * pixels of every row go through a scalar loop. Returns PRIMITIVES_SUCCESS.
 */
271sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(
const INT16* WINPR_RESTRICT pSrc[3],
272 WINPR_ATTR_UNUSED UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
/* Saturation bounds for the 8-bit colour channels. */
276 const __m128i zero = _mm_setzero_si128();
277 const __m128i max = _mm_set1_epi16(255);
/*
 * Row 14 of ycbcr_table, narrowed to int16_t, provides the four multipliers.
 * NOTE(review): presumably bound to r_cr, g_cb, g_cr and b_cb in column order — confirm.
 */
279 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0]));
281 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1]));
283 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2]));
285 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3]));
/* Offset added to every Y sample before scaling. */
286 const __m128i c4096 = _mm_set1_epi16(4096);
287 const INT16* y_buf = pSrc[0];
288 const INT16* cb_buf = pSrc[1];
289 const INT16* cr_buf = pSrc[2];
/* Pixels per row that do not fill a complete 16-pixel vector iteration. */
290 const UINT32 pad = roi->width % 16;
/* Number of INT16 lanes in one 128-bit register. */
291 const UINT32 step =
sizeof(__m128i) /
sizeof(INT16);
/* Number of 128-bit vectors covering the vectorised part of a row. */
292 const size_t imax = (roi->width - pad) *
sizeof(INT16) /
sizeof(__m128i);
/* dstStep minus the 4-bytes-per-pixel payload of one row. */
294 const size_t dstPad = (dstStep - roi->width * 4);
/* Issue prefetch hints for the three source planes. */
296 mm_prefetch_buffer(y_buf, roi->width, (
size_t)srcStep, roi->height);
297 mm_prefetch_buffer(cb_buf, roi->width, (
size_t)srcStep, roi->height);
298 mm_prefetch_buffer(cr_buf, roi->width, (
size_t)srcStep, roi->height);
300 for (UINT32 yp = 0; yp < roi->height; ++yp)
/* Two vectors (16 pixels) per iteration. */
302 for (
size_t i = 0; i < imax; i += 2)
/* First group of 8 pixels: y = (Y + 4096) >> 2. */
324 __m128i y1 = LOAD_SI128(y_buf);
326 y1 = _mm_add_epi16(y1, c4096);
327 y1 = _mm_srai_epi16(y1, 2);
329 __m128i cb1 = LOAD_SI128(cb_buf);
332 __m128i cr1 = LOAD_SI128(cr_buf);
/* r = clamp((y + mulhi(cr, r_cr)) >> 3, 0, 255) */
335 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
336 r1 = _mm_srai_epi16(r1, 3);
338 mm_between_epi16(r1, zero, max);
/* g = clamp((y + mulhi(cb, g_cb) + mulhi(cr, g_cr)) >> 3, 0, 255) */
340 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
341 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
342 g1 = _mm_srai_epi16(g1, 3);
344 mm_between_epi16(g1, zero, max);
/* b = clamp((y + mulhi(cb, b_cb)) >> 3, 0, 255) */
346 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
347 b1 = _mm_srai_epi16(b1, 3);
349 mm_between_epi16(b1, zero, max);
/* Second group of 8 pixels, same arithmetic as above. */
350 __m128i y2 = LOAD_SI128(y_buf);
352 y2 = _mm_add_epi16(y2, c4096);
353 y2 = _mm_srai_epi16(y2, 2);
355 __m128i cb2 = LOAD_SI128(cb_buf);
358 __m128i cr2 = LOAD_SI128(cr_buf);
361 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
362 r2 = _mm_srai_epi16(r2, 3);
364 mm_between_epi16(r2, zero, max);
366 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
367 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
368 g2 = _mm_srai_epi16(g2, 3);
370 mm_between_epi16(g2, zero, max);
372 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
373 b2 = _mm_srai_epi16(b2, 3);
375 mm_between_epi16(b2, zero, max);
/*
 * Narrow the 16-bit channel lanes to bytes (unsigned saturation) and interleave them,
 * first byte-wise and then in 16-bit pairs, to form the packed pixels.
 * NOTE(review): the 0xFFFFFFFF vector presumably supplies the constant fourth byte —
 * confirm against mm_set1_epu32.
 */
382 R0 = _mm_packus_epi16(R0, R1);
385 R1 = _mm_packus_epi16(R1, R2);
387 R2 = _mm_unpacklo_epi8(R0, R2);
388 R1 = _mm_unpackhi_epi8(R0, R1);
391 R0 = _mm_packus_epi16(R0, R3);
392 R3 = mm_set1_epu32(0xFFFFFFFFU);
394 R4 = _mm_unpacklo_epi8(R0, R4);
395 R3 = _mm_unpackhi_epi8(R0, R3);
397 R0 = _mm_unpacklo_epi16(R2, R0);
398 R4 = _mm_unpackhi_epi16(R2, R4);
400 R2 = _mm_unpacklo_epi16(R1, R2);
401 R3 = _mm_unpackhi_epi16(R1, R3);
/* Four 16-byte stores = 64 bytes = 16 output pixels. */
402 STORE_SI128(d_buf, R0);
403 d_buf +=
sizeof(__m128i);
404 STORE_SI128(d_buf, R4);
405 d_buf +=
sizeof(__m128i);
406 STORE_SI128(d_buf, R2);
407 d_buf +=
sizeof(__m128i);
408 STORE_SI128(d_buf, R3);
409 d_buf +=
sizeof(__m128i);
/* Scalar path for the trailing `pad` pixels of the row. */
413 for (UINT32 i = 0; i < pad; i++)
/* The scalar path works with row 16 of ycbcr_table and a 16-bit fixed-point Y. */
415 const INT32 divisor = 16;
416 const INT32 Y = ((*y_buf++) + 4096) << divisor;
417 const INT32 Cb = (*cb_buf++);
418 const INT32 Cr = (*cr_buf++);
419 const INT32 CrR = Cr * ycbcr_table[divisor][0];
420 const INT32 CrG = Cr * ycbcr_table[divisor][1];
421 const INT32 CbG = Cb * ycbcr_table[divisor][2];
422 const INT32 CbB = Cb * ycbcr_table[divisor][3];
/*
 * Drop the fixed-point scaling (>> divisor) and the remaining 5 fractional bits.
 * NOTE(review): the vector path adds the Cb/Cr green products to Y, whereas G below
 * subtracts them, and columns [1]/[2] are paired with Cr/Cb here — confirm both paths
 * yield the same green value.
 */
423 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
424 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
425 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
435 return PRIMITIVES_SUCCESS;
/*
 * Dispatcher for the YCbCr -> packed RGB conversion: delegates to the generic
 * implementation when the alignment checks fail or the destination format has no SSE2
 * variant, otherwise forwards to the BGRX or RGBX kernel and returns its status.
 */
439sse2_yCbCrToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
440 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
/* The low four bits of every pointer and of srcStep must be zero (16-byte alignment). */
443 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
444 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
/* Aligned loads/stores are not possible: use the generic implementation. */
448 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Both BGR 32-bit formats share the BGRX kernel. */
453 case PIXEL_FORMAT_BGRA32:
454 case PIXEL_FORMAT_BGRX32:
455 return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
/* Both RGB 32-bit formats share the RGBX kernel. */
457 case PIXEL_FORMAT_RGBA32:
458 case PIXEL_FORMAT_RGBX32:
459 return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
/* Any other destination format is handled by the generic implementation. */
462 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/*
 * SSE2 conversion of three planar INT16 channels (R, G, B in pSrc[0..2]) into three
 * planar INT16 channels (Y, Cb, Cr in pDst[0..2]), eight samples per 128-bit vector.
 * Delegates to the generic implementation unless every plane pointer is 16-byte
 * aligned, roi->width is a multiple of 8 and both steps are multiples of 128.
 * Returns PRIMITIVES_SUCCESS on the SSE2 path.
 */
469sse2_RGBToYCbCr_16s16s_P3P3(
const INT16* WINPR_RESTRICT pSrc[3],
int srcStep,
470 INT16* WINPR_RESTRICT pDst[3],
int dstStep,
/* View all six planes as arrays of 128-bit vectors. */
473 const __m128i* r_buf = (
const __m128i*)(pSrc[0]);
474 const __m128i* g_buf = (
const __m128i*)(pSrc[1]);
475 const __m128i* b_buf = (
const __m128i*)(pSrc[2]);
476 __m128i* y_buf = (__m128i*)(pDst[0]);
477 __m128i* cb_buf = (__m128i*)(pDst[1]);
478 __m128i* cr_buf = (__m128i*)(pDst[2]);
/* Preconditions of the aligned vector path; anything else goes to the generic code. */
480 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
481 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
482 ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
483 (srcStep & 127) || (dstStep & 127))
486 return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
/* Output range of every channel: [-128 * 32, 127 * 32]. */
489 const __m128i min = _mm_set1_epi16(-128 * 32);
490 const __m128i max = _mm_set1_epi16(127 * 32);
/* 16-bit fixed-point weights of R, G and B for each of the three output channels. */
492 __m128i y_r = _mm_set1_epi16(9798);
493 __m128i y_g = _mm_set1_epi16(19235);
494 __m128i y_b = _mm_set1_epi16(3735);
495 __m128i cb_r = _mm_set1_epi16(-5535);
496 __m128i cb_g = _mm_set1_epi16(-10868);
497 __m128i cb_b = _mm_set1_epi16(16403);
498 __m128i cr_r = _mm_set1_epi16(16377);
499 __m128i cr_g = _mm_set1_epi16(-13714);
500 __m128i cr_b = _mm_set1_epi16(-2663);
/* Row steps expressed in 128-bit vectors rather than bytes. */
501 const size_t srcbump = WINPR_ASSERTING_INT_CAST(
size_t, srcStep) /
sizeof(__m128i);
502 const size_t dstbump = WINPR_ASSERTING_INT_CAST(
size_t, dstStep) /
sizeof(__m128i);
/* Issue prefetch hints for the three source planes. */
504 mm_prefetch_buffer(r_buf, roi->width, (
size_t)srcStep, roi->height);
505 mm_prefetch_buffer(g_buf, roi->width, (
size_t)srcStep, roi->height);
506 mm_prefetch_buffer(b_buf, roi->width, (
size_t)srcStep, roi->height);
/* Number of 128-bit vectors per row. */
508 const size_t imax = roi->width *
sizeof(INT16) /
sizeof(__m128i);
510 for (UINT32 yp = 0; yp < roi->height; ++yp)
512 for (
size_t i = 0; i < imax; i++)
525 __m128i r = LOAD_SI128(r_buf + i);
526 __m128i g = LOAD_SI128(g_buf + i);
527 __m128i b = LOAD_SI128(b_buf + i);
/* Scale the inputs up by 2^6 before the high-half multiplications. */
529 r = _mm_slli_epi16(r, 6);
530 g = _mm_slli_epi16(g, 6);
531 b = _mm_slli_epi16(b, 6);
/* y = clamp(mulhi(r, y_r) + mulhi(g, y_g) + mulhi(b, y_b) + min, min, max) */
533 __m128i y = _mm_mulhi_epi16(r, y_r);
534 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
535 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
536 y = _mm_add_epi16(y, min);
538 mm_between_epi16(y, min, max);
539 STORE_SI128(y_buf + i, y);
/* cb = clamp(mulhi(r, cb_r) + mulhi(g, cb_g) + mulhi(b, cb_b), min, max) */
541 __m128i cb = _mm_mulhi_epi16(r, cb_r);
542 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
543 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
545 mm_between_epi16(cb, min, max);
546 STORE_SI128(cb_buf + i, cb);
/* cr = clamp(mulhi(r, cr_r) + mulhi(g, cr_g) + mulhi(b, cr_b), min, max) */
548 __m128i cr = _mm_mulhi_epi16(r, cr_r);
549 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
550 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
552 mm_between_epi16(cr, min, max);
553 STORE_SI128(cr_buf + i, cr);
564 return PRIMITIVES_SUCCESS;
/*
 * SSE2 packing of three planar 16-bit channels (R, G, B in pSrc[0..2]) into 4-byte
 * pixels stored in memory as B, G, R followed by a byte taken from the constant vector
 * `a`. Sixteen pixels are produced per vector iteration; the remaining
 * roi->width % 16 pixels of each row are converted one at a time with CLIP.
 * Returns PRIMITIVES_SUCCESS.
 */
568static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
569 const INT16* WINPR_RESTRICT pSrc[3],
571 BYTE* WINPR_RESTRICT pDst,
/* The source planes are walked through UINT16 pointers. */
575 const UINT16* pr = (
const UINT16*)(pSrc[0]);
576 const UINT16* pg = (
const UINT16*)(pSrc[1]);
577 const UINT16* pb = (
const UINT16*)(pSrc[2]);
/* Pixels per row left over for the scalar loop. */
578 const UINT32 pad = roi->width % 16;
/* Constant fourth byte of every pixel (presumably 0xFF in each lane — confirm). */
579 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
/* Source row slack in UINT16 elements; destination row slack in bytes. */
584 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
585 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
587 for (UINT32 y = 0; y < roi->height; ++y)
589 for (UINT32 x = 0; x < roi->width - pad; x += 16)
/* Narrow two vectors of 16-bit samples per channel to 16 bytes (unsigned saturation). */
604 b = _mm_packus_epi16(R0, R1);
613 g = _mm_packus_epi16(R0, R1);
622 r = _mm_packus_epi16(R0, R1);
/* Byte interleave: (b, g) pairs and (r, a) pairs. */
625 const __m128i gbLo = _mm_unpacklo_epi8(b, g);
626 const __m128i gbHi = _mm_unpackhi_epi8(b, g);
627 const __m128i arLo = _mm_unpacklo_epi8(r, a);
628 const __m128i arHi = _mm_unpackhi_epi8(r, a);
/* 16-bit interleave of the pairs yields four pixels per store, 16 pixels in total. */
631 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
632 STORE_SI128(out, bgrx);
636 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
637 STORE_SI128(out, bgrx);
641 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
642 STORE_SI128(out, bgrx);
646 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
647 STORE_SI128(out, bgrx);
/*
 * Scalar path for the trailing `pad` pixels.
 * NOTE(review): samples are read as UINT16 here while _mm_packus_epi16 treats the lanes
 * as signed — confirm CLIP gives the same result for negative inputs.
 */
653 for (UINT32 x = 0; x < pad; x++)
655 const BYTE R = CLIP(*pr++);
656 const BYTE G = CLIP(*pg++);
657 const BYTE B = CLIP(*pb++);
671 return PRIMITIVES_SUCCESS;
/*
 * SSE2 packing of three planar 16-bit channels (R, G, B in pSrc[0..2]) into 4-byte
 * pixels stored in memory as R, G, B followed by a byte taken from the constant vector
 * `a`. Sixteen pixels are produced per vector iteration; the remaining
 * roi->width % 16 pixels of each row are converted one at a time with CLIP.
 * Returns PRIMITIVES_SUCCESS.
 */
674static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
675 const INT16* WINPR_RESTRICT pSrc[3],
677 BYTE* WINPR_RESTRICT pDst,
/* The source planes are walked through UINT16 pointers. */
681 const UINT16* pr = (
const UINT16*)(pSrc[0]);
682 const UINT16* pg = (
const UINT16*)(pSrc[1]);
683 const UINT16* pb = (
const UINT16*)(pSrc[2]);
/* Pixels per row left over for the scalar loop. */
684 const UINT32 pad = roi->width % 16;
/* Constant fourth byte of every pixel (presumably 0xFF in each lane — confirm). */
685 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
/* Source row slack in UINT16 elements; destination row slack in bytes. */
690 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
691 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
693 for (UINT32 y = 0; y < roi->height; ++y)
695 for (UINT32 x = 0; x < roi->width - pad; x += 16)
/* Narrow two vectors of 16-bit samples per channel to 16 bytes (unsigned saturation). */
710 b = _mm_packus_epi16(R0, R1);
719 g = _mm_packus_epi16(R0, R1);
728 r = _mm_packus_epi16(R0, R1);
/*
 * Byte interleave: (r, g) pairs and (b, a) pairs. The variable names are shared with
 * the other variants and do not describe the channels held here.
 */
736 gbLo = _mm_unpacklo_epi8(r, g);
737 gbHi = _mm_unpackhi_epi8(r, g);
738 arLo = _mm_unpacklo_epi8(b, a);
739 arHi = _mm_unpackhi_epi8(b, a);
/* 16-bit interleave of the pairs yields four pixels per store, 16 pixels in total. */
742 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
743 STORE_SI128(out, bgrx);
747 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
748 STORE_SI128(out, bgrx);
752 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
753 STORE_SI128(out, bgrx);
757 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
758 STORE_SI128(out, bgrx);
/*
 * Scalar path for the trailing `pad` pixels.
 * NOTE(review): samples are read as UINT16 here while _mm_packus_epi16 treats the lanes
 * as signed — confirm CLIP gives the same result for negative inputs.
 */
764 for (UINT32 x = 0; x < pad; x++)
766 const BYTE R = CLIP(*pr++);
767 const BYTE G = CLIP(*pg++);
768 const BYTE B = CLIP(*pb++);
782 return PRIMITIVES_SUCCESS;
/*
 * SSE2 packing of three planar 16-bit channels (R, G, B in pSrc[0..2]) into 4-byte
 * pixels stored in memory as a byte taken from the constant vector `a` followed by
 * B, G, R. Sixteen pixels are produced per vector iteration; the remaining
 * roi->width % 16 pixels of each row are converted one at a time with CLIP.
 * Returns PRIMITIVES_SUCCESS.
 */
785static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
786 const INT16* WINPR_RESTRICT pSrc[3],
788 BYTE* WINPR_RESTRICT pDst,
/* The source planes are walked through UINT16 pointers. */
792 const UINT16* pr = (
const UINT16*)(pSrc[0]);
793 const UINT16* pg = (
const UINT16*)(pSrc[1]);
794 const UINT16* pb = (
const UINT16*)(pSrc[2]);
/* Pixels per row left over for the scalar loop. */
795 const UINT32 pad = roi->width % 16;
/* Constant first byte of every pixel (presumably 0xFF in each lane — confirm). */
796 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
/* Source row slack in UINT16 elements; destination row slack in bytes. */
801 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
802 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
804 for (UINT32 y = 0; y < roi->height; ++y)
806 for (UINT32 x = 0; x < roi->width - pad; x += 16)
/* Narrow two vectors of 16-bit samples per channel to 16 bytes (unsigned saturation). */
821 b = _mm_packus_epi16(R0, R1);
830 g = _mm_packus_epi16(R0, R1);
839 r = _mm_packus_epi16(R0, R1);
/*
 * Byte interleave: (a, b) pairs and (g, r) pairs. The variable names are shared with
 * the other variants and do not describe the channels held here.
 */
847 gbLo = _mm_unpacklo_epi8(a, b);
848 gbHi = _mm_unpackhi_epi8(a, b);
849 arLo = _mm_unpacklo_epi8(g, r);
850 arHi = _mm_unpackhi_epi8(g, r);
/* 16-bit interleave of the pairs yields four pixels per store, 16 pixels in total. */
853 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
854 STORE_SI128(out, bgrx);
858 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
859 STORE_SI128(out, bgrx);
863 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
864 STORE_SI128(out, bgrx);
868 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
869 STORE_SI128(out, bgrx);
/*
 * Scalar path for the trailing `pad` pixels.
 * NOTE(review): samples are read as UINT16 here while _mm_packus_epi16 treats the lanes
 * as signed — confirm CLIP gives the same result for negative inputs.
 */
875 for (UINT32 x = 0; x < pad; x++)
877 const BYTE R = CLIP(*pr++);
878 const BYTE G = CLIP(*pg++);
879 const BYTE B = CLIP(*pb++);
893 return PRIMITIVES_SUCCESS;
/*
 * SSE2 packing of three planar 16-bit channels (R, G, B in pSrc[0..2]) into 4-byte
 * pixels stored in memory as a byte taken from the constant vector `a` followed by
 * R, G, B. Sixteen pixels are produced per vector iteration; the remaining
 * roi->width % 16 pixels of each row are converted one at a time with CLIP.
 * Returns PRIMITIVES_SUCCESS.
 */
896static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
897 const INT16* WINPR_RESTRICT pSrc[3],
899 BYTE* WINPR_RESTRICT pDst,
/* The source planes are walked through UINT16 pointers. */
903 const UINT16* pr = (
const UINT16*)(pSrc[0]);
904 const UINT16* pg = (
const UINT16*)(pSrc[1]);
905 const UINT16* pb = (
const UINT16*)(pSrc[2]);
/* Constant first byte of every pixel (presumably 0xFF in each lane — confirm). */
906 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
/* Pixels per row left over for the scalar loop. */
907 const UINT32 pad = roi->width % 16;
/* Source row slack in UINT16 elements; destination row slack in bytes. */
912 srcbump = (srcStep - (roi->width *
sizeof(UINT16))) /
sizeof(UINT16);
913 dstbump = (dstStep - (roi->width *
sizeof(UINT32)));
915 for (UINT32 y = 0; y < roi->height; ++y)
917 for (UINT32 x = 0; x < roi->width - pad; x += 16)
/* Narrow two vectors of 16-bit samples per channel to 16 bytes (unsigned saturation). */
932 b = _mm_packus_epi16(R0, R1);
941 g = _mm_packus_epi16(R0, R1);
950 r = _mm_packus_epi16(R0, R1);
/*
 * Byte interleave: (a, r) pairs and (g, b) pairs. The variable names are shared with
 * the other variants and do not describe the channels held here.
 */
958 gbLo = _mm_unpacklo_epi8(a, r);
959 gbHi = _mm_unpackhi_epi8(a, r);
960 arLo = _mm_unpacklo_epi8(g, b);
961 arHi = _mm_unpackhi_epi8(g, b);
/* 16-bit interleave of the pairs yields four pixels per store, 16 pixels in total. */
964 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
965 STORE_SI128(out, bgrx);
969 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
970 STORE_SI128(out, bgrx);
974 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
975 STORE_SI128(out, bgrx);
979 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
980 STORE_SI128(out, bgrx);
/*
 * Scalar path for the trailing `pad` pixels.
 * NOTE(review): samples are read as UINT16 here while _mm_packus_epi16 treats the lanes
 * as signed — confirm CLIP gives the same result for negative inputs.
 */
986 for (UINT32 x = 0; x < pad; x++)
988 const BYTE R = CLIP(*pr++);
989 const BYTE G = CLIP(*pg++);
990 const BYTE B = CLIP(*pb++);
1004 return PRIMITIVES_SUCCESS;
/*
 * Dispatcher for the planar 16-bit RGB -> packed 32-bit pixel conversion: delegates to
 * the generic implementation when any pointer or step is not 16-byte aligned or when
 * DstFormat has no SSE2 variant, otherwise forwards to the kernel matching the
 * destination byte order and returns its status.
 */
1008sse2_RGBToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3],
1010 BYTE* WINPR_RESTRICT pDst,
1012 UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
/* The low four bits of all pointers and both steps must be zero (16-byte alignment). */
1014 if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1015 (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1016 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/* Each A/X format pair shares one kernel. */
1020 case PIXEL_FORMAT_BGRA32:
1021 case PIXEL_FORMAT_BGRX32:
1022 return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1024 case PIXEL_FORMAT_RGBA32:
1025 case PIXEL_FORMAT_RGBX32:
1026 return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1028 case PIXEL_FORMAT_ABGR32:
1029 case PIXEL_FORMAT_XBGR32:
1030 return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1032 case PIXEL_FORMAT_ARGB32:
1033 case PIXEL_FORMAT_XRGB32:
1034 return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
/* Any other destination format is handled by the generic implementation. */
1037 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/*
 * Install the SSE2 colour-conversion entry points into `prims`.
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the generic primitives are fetched first
 * (the SSE2 dispatchers call into them as a fallback) and three function pointers are
 * overridden.
 */
1042void primitives_init_colors_sse2_int(
primitives_t* WINPR_RESTRICT prims)
1044#if defined(SSE_AVX_INTRINSICS_ENABLED)
/* Keep the generic table for the fallback paths of the dispatchers. */
1045 generic = primitives_get_generic();
1047 WLog_VRB(PRIM_TAG,
"SSE2 optimizations");
1048 prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
1049 prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
1050 prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
/* NOTE(review): presumably the branch taken without SSE2 support; prims is left untouched. */
1053 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE2 intrinsics not available");
1054 WINPR_UNUSED(prims);