23#include <winpr/wtypes.h>
24#include <freerdp/config.h>
26#include <winpr/sysinfo.h>
28#include <freerdp/types.h>
29#include <freerdp/primitives.h>
31#include "prim_internal.h"
32#include "prim_avxsse.h"
35#if defined(SSE_AVX_INTRINSICS_ENABLED)
45static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
46 __m128i Vraw, UINT8 pos)
48 const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
49 mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
50 mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
51 mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
52 const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
53 mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
54 mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
55 mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
56 const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
57 mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
58 mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
59 const __m128i c128 = _mm_set1_epi16(128);
60 __m128i BGRX = _mm_and_si128(LOAD_SI128(dst),
61 mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
68 C = _mm_shuffle_epi8(Yraw, mapY[pos]);
72 const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]);
73 D = _mm_sub_epi16(U, c128);
77 const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]);
78 E = _mm_sub_epi16(V, c128);
82 const __m128i c403 = _mm_set1_epi16(403);
84 _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
85 const __m128i Rs = _mm_add_epi32(C, e403);
86 const __m128i R32 = _mm_srai_epi32(Rs, 8);
87 const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
88 const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
89 const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
90 BGRX = _mm_or_si128(BGRX, packed);
94 const __m128i c48 = _mm_set1_epi16(48);
96 _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
97 const __m128i c120 = _mm_set1_epi16(120);
99 _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
100 const __m128i de = _mm_add_epi32(d48, e120);
101 const __m128i Gs = _mm_sub_epi32(C, de);
102 const __m128i G32 = _mm_srai_epi32(Gs, 8);
103 const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
104 const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
105 const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
106 BGRX = _mm_or_si128(BGRX, packed);
110 const __m128i c475 = _mm_set1_epi16(475);
112 _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
113 const __m128i Bs = _mm_add_epi32(C, d475);
114 const __m128i B32 = _mm_srai_epi32(Bs, 8);
115 const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
116 const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
117 const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
118 BGRX = _mm_or_si128(BGRX, packed);
121 STORE_SI128(dst++, BGRX);
125static inline pstatus_t sse41_YUV420ToRGB_BGRX(
const BYTE* WINPR_RESTRICT pSrc[],
126 const UINT32* WINPR_RESTRICT srcStep,
127 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
130 const UINT32 nWidth = roi->width;
131 const UINT32 nHeight = roi->height;
132 const UINT32 pad = roi->width % 16;
133 const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
135 for (
size_t y = 0; y < nHeight; y++)
137 __m128i* dst = (__m128i*)(pDst + dstStep * y);
138 const BYTE* YData = pSrc[0] + y * srcStep[0];
139 const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
140 const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
142 for (UINT32 x = 0; x < nWidth - pad; x += 16)
144 const __m128i Y = LOAD_SI128(YData);
145 const __m128i uRaw = LOAD_SI128(UData);
146 const __m128i vRaw = LOAD_SI128(VData);
147 const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
148 const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
152 dst = sse41_YUV444Pixel(dst, Y, U, V, 0);
153 dst = sse41_YUV444Pixel(dst, Y, U, V, 1);
154 dst = sse41_YUV444Pixel(dst, Y, U, V, 2);
155 dst = sse41_YUV444Pixel(dst, Y, U, V, 3);
158 for (UINT32 x = 0; x < pad; x++)
160 const BYTE Y = *YData++;
161 const BYTE U = *UData;
162 const BYTE V = *VData;
163 dst = (__m128i*)writeYUVPixel((BYTE*)dst, PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
173 return PRIMITIVES_SUCCESS;
176static pstatus_t sse41_YUV420ToRGB(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
177 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
182 case PIXEL_FORMAT_BGRX32:
183 case PIXEL_FORMAT_BGRA32:
184 return sse41_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
187 return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
191static inline void BGRX_fillRGB(
size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
192 const BYTE* WINPR_RESTRICT pY[2],
const BYTE* WINPR_RESTRICT pU[2],
193 const BYTE* WINPR_RESTRICT pV[2], BOOL filter)
200 const UINT32 DstFormat = PIXEL_FORMAT_BGRX32;
201 const UINT32 bpp = 4;
203 for (
size_t i = 0; i < 2; i++)
205 for (
size_t j = 0; j < 2; j++)
207 const BYTE Y = pY[i][offset + j];
208 BYTE U = pU[i][offset + j];
209 BYTE V = pV[i][offset + j];
210 if ((i == 0) && (j == 0) && filter)
213 4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1];
215 4 * pV[0][offset] - pV[0][offset + 1] - pV[1][offset] - pV[1][offset + 1];
217 U = CONDITIONAL_CLIP(avgU, pU[0][offset]);
218 V = CONDITIONAL_CLIP(avgV, pV[0][offset]);
221 writeYUVPixel(&pRGB[i][(j + offset) * bpp], DstFormat, Y, U, V, writePixelBGRX);
227static inline __m128i sse41_yuv2x_single(
const __m128i Y, __m128i U, __m128i V,
const short iMulU,
230 const __m128i zero = _mm_set1_epi8(0);
232 __m128i Ylo = _mm_unpacklo_epi16(Y, zero);
233 __m128i Yhi = _mm_unpackhi_epi16(Y, zero);
236 const __m128i addX = _mm_set1_epi16(128);
237 const __m128i D = _mm_sub_epi16(U, addX);
238 const __m128i mulU = _mm_set1_epi16(iMulU);
239 const __m128i mulDlo = _mm_mullo_epi16(D, mulU);
240 const __m128i mulDhi = _mm_mulhi_epi16(D, mulU);
241 const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi);
242 Ylo = _mm_add_epi32(Ylo, Dlo);
244 const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi);
245 Yhi = _mm_add_epi32(Yhi, Dhi);
249 const __m128i addX = _mm_set1_epi16(128);
250 const __m128i E = _mm_sub_epi16(V, addX);
251 const __m128i mul = _mm_set1_epi16(iMulV);
252 const __m128i mulElo = _mm_mullo_epi16(E, mul);
253 const __m128i mulEhi = _mm_mulhi_epi16(E, mul);
254 const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi);
255 const __m128i esumlo = _mm_add_epi32(Ylo, Elo);
257 const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi);
258 const __m128i esumhi = _mm_add_epi32(Yhi, Ehi);
263 const __m128i rYlo = _mm_srai_epi32(Ylo, 8);
264 const __m128i rYhi = _mm_srai_epi32(Yhi, 8);
265 const __m128i rY = _mm_packs_epi32(rYlo, rYhi);
270static inline __m128i sse41_yuv2x(
const __m128i Y, __m128i U, __m128i V,
const short iMulU,
273 const __m128i zero = _mm_set1_epi8(0);
279 const __m128i Ylo = _mm_unpacklo_epi8(zero, Y);
280 const __m128i Ulo = _mm_unpacklo_epi8(U, zero);
281 const __m128i Vlo = _mm_unpacklo_epi8(V, zero);
282 const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV);
284 const __m128i Yhi = _mm_unpackhi_epi8(zero, Y);
285 const __m128i Uhi = _mm_unpackhi_epi8(U, zero);
286 const __m128i Vhi = _mm_unpackhi_epi8(V, zero);
287 const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV);
288 const __m128i res = _mm_packus_epi16(preslo, preshi);
294static inline __m128i sse41_yuv2r(
const __m128i Y, __m128i U, __m128i V)
296 return sse41_yuv2x(Y, U, V, 0, 403);
300static inline __m128i sse41_yuv2g(
const __m128i Y, __m128i U, __m128i V)
302 return sse41_yuv2x(Y, U, V, -48, -120);
306static inline __m128i sse41_yuv2b(
const __m128i Y, __m128i U, __m128i V)
308 return sse41_yuv2x(Y, U, V, 475, 0);
311static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y, __m128i U,
314 const __m128i zero = _mm_set1_epi8(0);
316 const __m128i r = sse41_yuv2r(Y, U, V);
317 const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) };
319 const __m128i g = sse41_yuv2g(Y, U, V);
320 const __m128i b = sse41_yuv2b(Y, U, V);
322 const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) };
324 const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF,
325 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF);
327 __m128i* rgb = (__m128i*)pRGB;
328 const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]);
329 _mm_maskmoveu_si128(bgrx0, mask, (
char*)&rgb[0]);
330 const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]);
331 _mm_maskmoveu_si128(bgrx1, mask, (
char*)&rgb[1]);
332 const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]);
333 _mm_maskmoveu_si128(bgrx2, mask, (
char*)&rgb[2]);
334 const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]);
335 _mm_maskmoveu_si128(bgrx3, mask, (
char*)&rgb[3]);
338static inline __m128i odd1sum(__m128i u1)
340 const __m128i zero = _mm_set1_epi8(0);
341 const __m128i u1hi = _mm_unpackhi_epi8(u1, zero);
342 const __m128i u1lo = _mm_unpacklo_epi8(u1, zero);
343 return _mm_hadds_epi16(u1lo, u1hi);
346static inline __m128i odd0sum(__m128i u0, __m128i u1sum)
350 const __m128i mask = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07,
351 0x80, 0x05, 0x80, 0x03, 0x80, 0x01);
352 const __m128i u0odd = _mm_shuffle_epi8(u0, mask);
353 return _mm_adds_epi16(u1sum, u0odd);
356static inline __m128i calcavg(__m128i u0even, __m128i sum)
358 const __m128i u4zero = _mm_slli_epi16(u0even, 2);
359 const __m128i uavg = _mm_sub_epi16(u4zero, sum);
360 const __m128i zero = _mm_set1_epi8(0);
361 const __m128i savg = _mm_packus_epi16(uavg, zero);
362 const __m128i smask = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03,
363 0x80, 0x02, 0x80, 0x01, 0x80, 0x00);
364 return _mm_shuffle_epi8(savg, smask);
367static inline __m128i diffmask(__m128i avg, __m128i u0even)
372 const __m128i diff = _mm_subs_epi16(u0even, avg);
373 const __m128i absdiff = _mm_abs_epi16(diff);
374 const __m128i val30 = _mm_set1_epi16(30);
375 return _mm_cmplt_epi16(absdiff, val30);
378static inline void sse41_filter(__m128i pU[2])
380 const __m128i u1sum = odd1sum(pU[1]);
381 const __m128i sum = odd0sum(pU[0], u1sum);
384 const __m128i emask = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
385 0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
386 const __m128i u0even = _mm_and_si128(pU[0], emask);
387 const __m128i avg = calcavg(u0even, sum);
388 const __m128i umask = diffmask(avg, u0even);
390 const __m128i u0orig = _mm_and_si128(u0even, umask);
391 const __m128i u0avg = _mm_andnot_si128(umask, avg);
392 const __m128i evenresult = _mm_or_si128(u0orig, u0avg);
393 const __m128i omask = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
394 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00);
395 const __m128i u0odd = _mm_and_si128(pU[0], omask);
396 const __m128i result = _mm_or_si128(evenresult, u0odd);
400static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2],
const __m128i pY[2],
401 __m128i pU[2], __m128i pV[2])
411 for (
size_t i = 0; i < 2; i++)
413 sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]);
417static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
418 BYTE* WINPR_RESTRICT pDst[2],
const BYTE* WINPR_RESTRICT YData[2],
419 const BYTE* WINPR_RESTRICT UData[2],
const BYTE* WINPR_RESTRICT VData[2], UINT32 nWidth)
421 WINPR_ASSERT((nWidth % 2) == 0);
422 const UINT32 pad = nWidth % 16;
425 for (; x < nWidth - pad; x += 16)
427 const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) };
428 __m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) };
429 __m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) };
431 BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
432 sse41_BGRX_fillRGB(dstp, Y, U, V);
435 for (; x < nWidth; x += 2)
437 BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE);
440 return PRIMITIVES_SUCCESS;
443static inline void BGRX_fillRGB_single(
size_t offset, BYTE* WINPR_RESTRICT pRGB,
444 const BYTE* WINPR_RESTRICT pY,
const BYTE* WINPR_RESTRICT pU,
445 const BYTE* WINPR_RESTRICT pV, WINPR_ATTR_UNUSED BOOL filter)
452 const UINT32 bpp = 4;
454 for (
size_t j = 0; j < 2; j++)
456 const BYTE Y = pY[offset + j];
457 BYTE U = pU[offset + j];
458 BYTE V = pV[offset + j];
460 writeYUVPixel(&pRGB[(j + offset) * bpp], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
464static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(
465 BYTE* WINPR_RESTRICT pDst,
const BYTE* WINPR_RESTRICT YData,
const BYTE* WINPR_RESTRICT UData,
466 const BYTE* WINPR_RESTRICT VData, UINT32 nWidth)
468 WINPR_ASSERT((nWidth % 2) == 0);
470 for (
size_t x = 0; x < nWidth; x += 2)
472 BGRX_fillRGB_single(x, pDst, YData, UData, VData, TRUE);
475 return PRIMITIVES_SUCCESS;
478static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX(
const BYTE* WINPR_RESTRICT pSrc[],
479 const UINT32 srcStep[],
480 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
483 const UINT32 nWidth = roi->width;
484 const UINT32 nHeight = roi->height;
487 for (; y < nHeight - nHeight % 2; y += 2)
489 BYTE* dst[] = { (pDst + dstStep * y), (pDst + dstStep * (y + 1)) };
490 const BYTE* YData[] = { pSrc[0] + y * srcStep[0], pSrc[0] + (y + 1) * srcStep[0] };
491 const BYTE* UData[] = { pSrc[1] + y * srcStep[1], pSrc[1] + (y + 1) * srcStep[1] };
492 const BYTE* VData[] = { pSrc[2] + y * srcStep[2], pSrc[2] + (y + 1) * srcStep[2] };
495 sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(dst, YData, UData, VData, nWidth);
496 if (rc != PRIMITIVES_SUCCESS)
499 for (; y < nHeight; y++)
501 BYTE* dst = (pDst + dstStep * y);
502 const BYTE* YData = pSrc[0] + y * srcStep[0];
503 const BYTE* UData = pSrc[1] + y * srcStep[1];
504 const BYTE* VData = pSrc[2] + y * srcStep[2];
506 sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(dst, YData, UData, VData, nWidth);
507 if (rc != PRIMITIVES_SUCCESS)
511 return PRIMITIVES_SUCCESS;
514static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(
const BYTE* WINPR_RESTRICT pSrc[],
515 const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
516 UINT32 dstStep, UINT32 DstFormat,
521 case PIXEL_FORMAT_BGRX32:
522 case PIXEL_FORMAT_BGRA32:
523 return sse41_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
526 return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
555#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
556#define BGRX_U_FACTORS \
557 _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
558#define BGRX_V_FACTORS \
559 _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
560#define CONST128_FACTORS _mm_set1_epi8(-128)
583static inline void sse41_BGRX_TO_YUV(
const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine,
584 BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine)
586 const BYTE r1 = pLine1[2];
587 const BYTE g1 = pLine1[1];
588 const BYTE b1 = pLine1[0];
591 pYLine[0] = RGB2Y(r1, g1, b1);
593 pULine[0] = RGB2U(r1, g1, b1);
595 pVLine[0] = RGB2V(r1, g1, b1);
600static INLINE
void sse41_RGBToYUV420_BGRX_Y(
const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
602 const __m128i y_factors = BGRX_Y_FACTORS;
603 const __m128i* argb = (
const __m128i*)src;
604 __m128i* ydst = (__m128i*)dst;
608 for (; x < width - width % 16; x += 16)
611 __m128i x0 = LOAD_SI128(argb++);
613 x0 = _mm_maddubs_epi16(x0, y_factors);
615 __m128i x1 = LOAD_SI128(argb++);
616 x1 = _mm_maddubs_epi16(x1, y_factors);
617 x0 = _mm_hadds_epi16(x0, x1);
618 x0 = _mm_srli_epi16(x0, Y_SHIFT);
621 __m128i x2 = LOAD_SI128(argb++);
623 x2 = _mm_maddubs_epi16(x2, y_factors);
625 __m128i x3 = LOAD_SI128(argb++);
626 x3 = _mm_maddubs_epi16(x3, y_factors);
627 x2 = _mm_hadds_epi16(x2, x3);
628 x2 = _mm_srli_epi16(x2, Y_SHIFT);
631 x0 = _mm_packus_epi16(x0, x2);
633 STORE_SI128(ydst++, x0);
636 for (; x < width; x++)
638 sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL);
644static INLINE
void sse41_RGBToYUV420_BGRX_UV(
const BYTE* WINPR_RESTRICT src1,
645 const BYTE* WINPR_RESTRICT src2,
646 BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
649 const __m128i u_factors = BGRX_U_FACTORS;
650 const __m128i v_factors = BGRX_V_FACTORS;
651 const __m128i vector128 = CONST128_FACTORS;
655 for (; x < width - width % 16; x += 16)
657 const __m128i* rgb1 = (
const __m128i*)&src1[4ULL * x];
658 const __m128i* rgb2 = (
const __m128i*)&src2[4ULL * x];
659 __m64* udst = (__m64*)&dst1[x / 2];
660 __m64* vdst = (__m64*)&dst2[x / 2];
663 __m128i x0 = LOAD_SI128(&rgb1[0]);
664 __m128i x4 = LOAD_SI128(&rgb2[0]);
665 x0 = _mm_avg_epu8(x0, x4);
667 __m128i x1 = LOAD_SI128(&rgb1[1]);
668 x4 = LOAD_SI128(&rgb2[1]);
669 x1 = _mm_avg_epu8(x1, x4);
671 __m128i x2 = LOAD_SI128(&rgb1[2]);
672 x4 = LOAD_SI128(&rgb2[2]);
673 x2 = _mm_avg_epu8(x2, x4);
675 __m128i x3 = LOAD_SI128(&rgb1[3]);
676 x4 = LOAD_SI128(&rgb2[3]);
677 x3 = _mm_avg_epu8(x3, x4);
685 x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
686 x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
687 x0 = _mm_avg_epu8(x0, x4);
688 x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
689 x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
690 x1 = _mm_avg_epu8(x1, x4);
692 x2 = _mm_maddubs_epi16(x0, u_factors);
693 x3 = _mm_maddubs_epi16(x1, u_factors);
694 x4 = _mm_maddubs_epi16(x0, v_factors);
695 __m128i x5 = _mm_maddubs_epi16(x1, v_factors);
697 x0 = _mm_hadd_epi16(x2, x3);
698 x1 = _mm_hadd_epi16(x4, x5);
700 x0 = _mm_srai_epi16(x0, U_SHIFT);
701 x1 = _mm_srai_epi16(x1, V_SHIFT);
703 x0 = _mm_packs_epi16(x0, x1);
705 x0 = _mm_sub_epi8(x0, vector128);
707 _mm_storel_pi(udst, _mm_castsi128_ps(x0));
709 _mm_storeh_pi(vdst, _mm_castsi128_ps(x0));
712 for (; x < width - width % 2; x += 2)
716 sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]);
717 sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]);
718 sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]);
719 sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]);
720 const INT16 u4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)u[0] + u[1] + u[2] + u[3]);
721 const INT16 uu = WINPR_ASSERTING_INT_CAST(INT16, u4 / 4);
722 const BYTE u8 = CLIP(uu);
725 const INT16 v4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)v[0] + v[1] + v[2] + v[3]);
726 const INT16 vu = WINPR_ASSERTING_INT_CAST(INT16, v4 / 4);
727 const BYTE v8 = CLIP(vu);
732static pstatus_t sse41_RGBToYUV420_BGRX(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
733 BYTE* WINPR_RESTRICT pDst[],
const UINT32 dstStep[],
736 if (roi->height < 1 || roi->width < 1)
738 return !PRIMITIVES_SUCCESS;
742 for (; y < roi->height - roi->height % 2; y += 2)
744 const BYTE* line1 = &pSrc[y * srcStep];
745 const BYTE* line2 = &pSrc[(1ULL + y) * srcStep];
746 BYTE* ydst1 = &pDst[0][y * dstStep[0]];
747 BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]];
748 BYTE* udst = &pDst[1][y / 2 * dstStep[1]];
749 BYTE* vdst = &pDst[2][y / 2 * dstStep[2]];
751 sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
752 sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width);
753 sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width);
756 for (; y < roi->height; y++)
758 const BYTE* line = &pSrc[y * srcStep];
759 BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]];
760 sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width);
763 return PRIMITIVES_SUCCESS;
766static pstatus_t sse41_RGBToYUV420(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
767 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
768 const UINT32 dstStep[],
const prim_size_t* WINPR_RESTRICT roi)
772 case PIXEL_FORMAT_BGRX32:
773 case PIXEL_FORMAT_BGRA32:
774 return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);
777 return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
785static INLINE
void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
786 const BYTE* WINPR_RESTRICT srcEven,
const BYTE* WINPR_RESTRICT srcOdd,
787 BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
788 BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
789 BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
791 const __m128i* argbEven = (
const __m128i*)srcEven;
792 const __m128i* argbOdd = (
const __m128i*)srcOdd;
793 const __m128i y_factors = BGRX_Y_FACTORS;
794 const __m128i u_factors = BGRX_U_FACTORS;
795 const __m128i v_factors = BGRX_V_FACTORS;
796 const __m128i vector128 = CONST128_FACTORS;
799 for (; x < width - width % 16; x += 16)
802 const __m128i xe1 = LOAD_SI128(argbEven++);
803 const __m128i xe2 = LOAD_SI128(argbEven++);
804 const __m128i xe3 = LOAD_SI128(argbEven++);
805 const __m128i xe4 = LOAD_SI128(argbEven++);
806 const __m128i xo1 = LOAD_SI128(argbOdd++);
807 const __m128i xo2 = LOAD_SI128(argbOdd++);
808 const __m128i xo3 = LOAD_SI128(argbOdd++);
809 const __m128i xo4 = LOAD_SI128(argbOdd++);
812 const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
813 _mm_maddubs_epi16(xe2, y_factors)),
815 const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
816 _mm_maddubs_epi16(xe4, y_factors)),
818 const __m128i ye = _mm_packus_epi16(ye1, ye2);
819 const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
820 _mm_maddubs_epi16(xo2, y_factors)),
822 const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
823 _mm_maddubs_epi16(xo4, y_factors)),
825 const __m128i yo = _mm_packus_epi16(yo1, yo2);
827 STORE_SI128(b1Even, ye);
832 STORE_SI128(b1Odd, yo);
847 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
848 _mm_maddubs_epi16(xe2, u_factors)),
851 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
852 _mm_maddubs_epi16(xe4, u_factors)),
854 ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
860 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
861 _mm_maddubs_epi16(xo2, u_factors)),
864 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
865 _mm_maddubs_epi16(xo4, u_factors)),
867 uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
876 const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
877 const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
878 const __m128i hi = _mm_add_epi16(ueh, uoh);
879 const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
880 const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
881 const __m128i lo = _mm_add_epi16(uel, uol);
882 const __m128i added = _mm_hadd_epi16(lo, hi);
883 const __m128i avg16 = _mm_srai_epi16(added, 2);
884 const __m128i avg = _mm_packus_epi16(avg16, avg16);
885 _mm_storel_epi64((__m128i*)b2, avg);
890 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
891 (
char)0x80, (
char)0x80, (
char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
892 const __m128i ud = _mm_shuffle_epi8(ue, mask);
893 _mm_storel_epi64((__m128i*)b2, ud);
907 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
908 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
909 const __m128i ude = _mm_shuffle_epi8(ue, mask);
910 _mm_storel_epi64((__m128i*)b6, ude);
925 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
926 _mm_maddubs_epi16(xe2, v_factors)),
929 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
930 _mm_maddubs_epi16(xe4, v_factors)),
932 ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
938 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
939 _mm_maddubs_epi16(xo2, v_factors)),
942 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
943 _mm_maddubs_epi16(xo4, v_factors)),
945 vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
954 const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
955 const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
956 const __m128i hi = _mm_add_epi16(veh, voh);
957 const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
958 const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
959 const __m128i lo = _mm_add_epi16(vel, vol);
960 const __m128i added = _mm_hadd_epi16(lo, hi);
961 const __m128i avg16 = _mm_srai_epi16(added, 2);
962 const __m128i avg = _mm_packus_epi16(avg16, avg16);
963 _mm_storel_epi64((__m128i*)b3, avg);
968 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
969 (
char)0x80, (
char)0x80, (
char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
970 const __m128i vd = _mm_shuffle_epi8(ve, mask);
971 _mm_storel_epi64((__m128i*)b3, vd);
985 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
986 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
987 const __m128i vde = _mm_shuffle_epi8(ve, mask);
988 _mm_storel_epi64((__m128i*)b7, vde);
994 general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
998static pstatus_t sse41_RGBToAVC444YUV_BGRX(
const BYTE* WINPR_RESTRICT pSrc,
999 WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
1000 BYTE* WINPR_RESTRICT pDst1[],
const UINT32 dst1Step[],
1001 BYTE* WINPR_RESTRICT pDst2[],
const UINT32 dst2Step[],
1004 if (roi->height < 1 || roi->width < 1)
1005 return !PRIMITIVES_SUCCESS;
1008 for (; y < roi->height - roi->height % 2; y += 2)
1010 const BYTE* srcEven = pSrc + y * srcStep;
1011 const BYTE* srcOdd = pSrc + (y + 1) * srcStep;
1012 const size_t i = y >> 1;
1013 const size_t n = (i & (size_t)~7) + i;
1014 BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1015 BYTE* b1Odd = (b1Even + dst1Step[0]);
1016 BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1017 BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1018 BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
1019 BYTE* b5 = b4 + 8ULL * dst2Step[0];
1020 BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1021 BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1022 sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
1026 for (; y < roi->height; y++)
1028 const BYTE* srcEven = pSrc + y * srcStep;
1029 BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1030 BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1031 BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1032 BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1033 BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1034 general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, NULL, b1Even, NULL, b2, b3, NULL, NULL,
1035 b6, b7, roi->width);
1038 return PRIMITIVES_SUCCESS;
1041static pstatus_t sse41_RGBToAVC444YUV(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1042 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1043 const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1044 const UINT32 dst2Step[],
1049 case PIXEL_FORMAT_BGRX32:
1050 case PIXEL_FORMAT_BGRA32:
1051 return sse41_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1055 return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1073static INLINE
void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
1074 const BYTE* WINPR_RESTRICT srcEven,
const BYTE* WINPR_RESTRICT srcOdd,
1075 BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
1076 BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
1077 BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
1078 BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
1079 BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
1080 BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
1082 const __m128i vector128 = CONST128_FACTORS;
1083 const __m128i* argbEven = (
const __m128i*)srcEven;
1084 const __m128i* argbOdd = (
const __m128i*)srcOdd;
1087 for (; x < width - width % 16; x += 16)
1092 const __m128i xe1 = LOAD_SI128(argbEven++);
1093 const __m128i xe2 = LOAD_SI128(argbEven++);
1094 const __m128i xe3 = LOAD_SI128(argbEven++);
1095 const __m128i xe4 = LOAD_SI128(argbEven++);
1096 const __m128i xo1 = LOAD_SI128(argbOdd++);
1097 const __m128i xo2 = LOAD_SI128(argbOdd++);
1098 const __m128i xo3 = LOAD_SI128(argbOdd++);
1099 const __m128i xo4 = LOAD_SI128(argbOdd++);
1102 const __m128i y_factors = BGRX_Y_FACTORS;
1103 const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
1104 _mm_maddubs_epi16(xe2, y_factors)),
1106 const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
1107 _mm_maddubs_epi16(xe4, y_factors)),
1109 const __m128i ye = _mm_packus_epi16(ye1, ye2);
1111 STORE_SI128(yLumaDstEven, ye);
1117 const __m128i y_factors = BGRX_Y_FACTORS;
1118 const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
1119 _mm_maddubs_epi16(xo2, y_factors)),
1121 const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
1122 _mm_maddubs_epi16(xo4, y_factors)),
1124 const __m128i yo = _mm_packus_epi16(yo1, yo2);
1125 STORE_SI128(yLumaDstOdd, yo);
1141 const __m128i u_factors = BGRX_U_FACTORS;
1143 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
1144 _mm_maddubs_epi16(xe2, u_factors)),
1147 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
1148 _mm_maddubs_epi16(xe4, u_factors)),
1150 const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
1151 ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
1155 const __m128i u_factors = BGRX_U_FACTORS;
1157 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
1158 _mm_maddubs_epi16(xo2, u_factors)),
1161 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
1162 _mm_maddubs_epi16(xo4, u_factors)),
1164 const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
1165 uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
1166 uavg = _mm_add_epi16(uavg, uoavg);
1167 uavg = _mm_srai_epi16(uavg, 2);
1168 uavg = _mm_packs_epi16(uavg, uoavg);
1169 uavg = _mm_sub_epi8(uavg, vector128);
1177 const __m128i mask =
1178 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1179 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1180 const __m128i ude = _mm_shuffle_epi8(ue, mask);
1181 _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
1182 yEvenChromaDst1 += 8;
1187 const __m128i mask =
1188 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1189 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1190 const __m128i udo = _mm_shuffle_epi8(uo, mask);
1191 _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
1192 yOddChromaDst1 += 8;
1197 const __m128i mask =
1198 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1199 (
char)0x80, (
char)0x80, (
char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1200 const __m128i ud = _mm_shuffle_epi8(uo, mask);
1201 int* uDst1 = (
int*)uChromaDst1;
1202 int* vDst1 = (
int*)vChromaDst1;
1203 const int* src = (
const int*)&ud;
1204 _mm_stream_si32(uDst1, src[0]);
1205 _mm_stream_si32(vDst1, src[1]);
1212 _mm_storel_epi64((__m128i*)uLumaDst, uavg);
1217 const __m128i mask =
1218 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1219 (
char)0x80, (
char)0x80, (
char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1220 const __m128i ud = _mm_shuffle_epi8(ue, mask);
1221 _mm_storel_epi64((__m128i*)uLumaDst, ud);
1232 const __m128i v_factors = BGRX_V_FACTORS;
1234 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
1235 _mm_maddubs_epi16(xe2, v_factors)),
1238 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
1239 _mm_maddubs_epi16(xe4, v_factors)),
1241 const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
1242 ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
1246 const __m128i v_factors = BGRX_V_FACTORS;
1248 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
1249 _mm_maddubs_epi16(xo2, v_factors)),
1252 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
1253 _mm_maddubs_epi16(xo4, v_factors)),
1255 const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
1256 vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
1257 vavg = _mm_add_epi16(vavg, voavg);
1258 vavg = _mm_srai_epi16(vavg, 2);
1259 vavg = _mm_packs_epi16(vavg, voavg);
1260 vavg = _mm_sub_epi8(vavg, vector128);
1268 const __m128i mask =
1269 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1270 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1271 __m128i vde = _mm_shuffle_epi8(ve, mask);
1272 _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
1273 yEvenChromaDst2 += 8;
1278 const __m128i mask =
1279 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1280 (
char)0x80, (
char)0x80, (
char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1281 __m128i vdo = _mm_shuffle_epi8(vo, mask);
1282 _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
1283 yOddChromaDst2 += 8;
1288 const __m128i mask =
1289 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1290 (
char)0x80, (
char)0x80, (
char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1291 const __m128i vd = _mm_shuffle_epi8(vo, mask);
1292 int* uDst2 = (
int*)uChromaDst2;
1293 int* vDst2 = (
int*)vChromaDst2;
1294 const int* src = (
const int*)&vd;
1295 _mm_stream_si32(uDst2, src[0]);
1296 _mm_stream_si32(vDst2, src[1]);
1303 _mm_storel_epi64((__m128i*)vLumaDst, vavg);
1308 const __m128i mask =
1309 _mm_set_epi8((
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80, (
char)0x80,
1310 (
char)0x80, (
char)0x80, (
char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1311 __m128i vd = _mm_shuffle_epi8(ve, mask);
1312 _mm_storel_epi64((__m128i*)vLumaDst, vd);
1318 general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd,
1319 uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2,
1320 yOddChromaDst1, yOddChromaDst2, uChromaDst1,
1321 uChromaDst2, vChromaDst1, vChromaDst2, width);
1324static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(
const BYTE* WINPR_RESTRICT pSrc,
1325 WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
1326 BYTE* WINPR_RESTRICT pDst1[],
const UINT32 dst1Step[],
1327 BYTE* WINPR_RESTRICT pDst2[],
const UINT32 dst2Step[],
1330 if (roi->height < 1 || roi->width < 1)
1331 return !PRIMITIVES_SUCCESS;
1334 for (; y < roi->height - roi->height % 2; y += 2)
1336 const BYTE* srcEven = (pSrc + y * srcStep);
1337 const BYTE* srcOdd = (srcEven + srcStep);
1338 BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1339 BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]);
1340 BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1341 BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1342 BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1343 BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1344 BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
1345 BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
1346 BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1347 BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1348 BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1349 BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1350 sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
1351 dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
1352 dstOddChromaY1, dstOddChromaY2, dstChromaU1,
1353 dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
1356 for (; y < roi->height; y++)
1358 const BYTE* srcEven = (pSrc + y * srcStep);
1359 BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1360 BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1361 BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1362 BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1363 BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1364 BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1365 BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1366 BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1367 BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1368 general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(0, srcEven, NULL, dstLumaYEven, NULL, dstLumaU,
1369 dstLumaV, dstEvenChromaY1, dstEvenChromaY2, NULL,
1370 NULL, dstChromaU1, dstChromaU2, dstChromaV1,
1371 dstChromaV2, roi->width);
1374 return PRIMITIVES_SUCCESS;
1377static pstatus_t sse41_RGBToAVC444YUVv2(
const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1378 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1379 const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1380 const UINT32 dst2Step[],
1385 case PIXEL_FORMAT_BGRX32:
1386 case PIXEL_FORMAT_BGRA32:
1387 return sse41_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1391 return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1396static pstatus_t sse41_LumaToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[],
const UINT32 srcStep[],
1397 BYTE* WINPR_RESTRICT pDstRaw[],
const UINT32 dstStep[],
1400 const UINT32 nWidth = roi->right - roi->left;
1401 const UINT32 nHeight = roi->bottom - roi->top;
1402 const UINT32 halfWidth = (nWidth + 1) / 2;
1403 const UINT32 halfPad = halfWidth % 16;
1404 const UINT32 halfHeight = (nHeight + 1) / 2;
1405 const UINT32 oddY = 1;
1406 const UINT32 evenY = 0;
1407 const UINT32 oddX = 1;
1408 const UINT32 evenX = 0;
1409 const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1410 pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1411 pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1412 BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1413 pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1414 pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1418 for (
size_t y = 0; y < nHeight; y++)
1420 const BYTE* Ym = pSrc[0] + y * srcStep[0];
1421 BYTE* pY = pDst[0] + y * dstStep[0];
1422 memcpy(pY, Ym, nWidth);
1427 for (
size_t y = 0; y < halfHeight; y++)
1429 const size_t val2y = (2 * y + evenY);
1430 const size_t val2y1 = val2y + oddY;
1431 const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
1432 const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
1433 BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1434 BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1435 BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1436 BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1439 for (; x < halfWidth - halfPad; x += 16)
1441 const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1442 const __m128i unpackLow =
1443 _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
1445 const __m128i u = LOAD_SI128(&Um[x]);
1446 const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1447 const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1448 STORE_SI128(&pU[2ULL * x], uHigh);
1449 STORE_SI128(&pU[2ULL * x + 16], uLow);
1450 STORE_SI128(&pU1[2ULL * x], uHigh);
1451 STORE_SI128(&pU1[2ULL * x + 16], uLow);
1454 const __m128i u = LOAD_SI128(&Vm[x]);
1455 const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1456 const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1457 STORE_SI128(&pV[2 * x], uHigh);
1458 STORE_SI128(&pV[2 * x + 16], uLow);
1459 STORE_SI128(&pV1[2 * x], uHigh);
1460 STORE_SI128(&pV1[2 * x + 16], uLow);
1464 for (; x < halfWidth; x++)
1466 const size_t val2x = 2 * x + evenX;
1467 const size_t val2x1 = val2x + oddX;
1474 pU1[val2x1] = Um[x];
1475 pV1[val2x1] = Vm[x];
1479 return PRIMITIVES_SUCCESS;
1482static pstatus_t sse41_ChromaV1ToYUV444(
const BYTE* WINPR_RESTRICT pSrcRaw[3],
1483 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
1484 const UINT32 dstStep[3],
1487 const UINT32 mod = 16;
1490 const UINT32 nWidth = roi->right - roi->left;
1491 const UINT32 nHeight = roi->bottom - roi->top;
1492 const UINT32 halfWidth = (nWidth + 1) / 2;
1493 const UINT32 halfPad = halfWidth % 16;
1494 const UINT32 halfHeight = (nHeight + 1) / 2;
1495 const UINT32 oddY = 1;
1496 const UINT32 evenY = 0;
1497 const UINT32 oddX = 1;
1500 const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
1501 const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1502 pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1503 pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1504 BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1505 pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1506 pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1507 const __m128i zero = _mm_setzero_si128();
1508 const __m128i mask = _mm_set_epi8(0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0,
1509 (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80);
1513 for (
size_t y = 0; y < padHeigth; y++)
1515 const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
1518 if ((y) % mod < (mod + 1) / 2)
1520 const UINT32 pos = (2 * uY++ + oddY);
1525 pX = pDst[1] + 1ULL * dstStep[1] * pos;
1529 const UINT32 pos = (2 * vY++ + oddY);
1534 pX = pDst[2] + 1ULL * dstStep[2] * pos;
1537 memcpy(pX, Ya, nWidth);
1541 for (
size_t y = 0; y < halfHeight; y++)
1543 const size_t val2y = (y * 2 + evenY);
1544 const BYTE* Ua = pSrc[1] + srcStep[1] * y;
1545 const BYTE* Va = pSrc[2] + srcStep[2] * y;
1546 BYTE* pU = pDst[1] + dstStep[1] * val2y;
1547 BYTE* pV = pDst[2] + dstStep[2] * val2y;
1550 for (; x < halfWidth - halfPad; x += 16)
1553 const __m128i u = LOAD_SI128(&Ua[x]);
1554 const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1555 const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1556 _mm_maskmoveu_si128(u1, mask, (
char*)&pU[2 * x]);
1557 _mm_maskmoveu_si128(u2, mask, (
char*)&pU[2 * x + 16]);
1560 const __m128i u = LOAD_SI128(&Va[x]);
1561 const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1562 const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1563 _mm_maskmoveu_si128(u1, mask, (
char*)&pV[2 * x]);
1564 _mm_maskmoveu_si128(u2, mask, (
char*)&pV[2 * x + 16]);
1568 for (; x < halfWidth; x++)
1570 const size_t val2x1 = (x * 2ULL + oddX);
1576 return PRIMITIVES_SUCCESS;
1579static pstatus_t sse41_ChromaV2ToYUV444(
const BYTE* WINPR_RESTRICT pSrc[3],
const UINT32 srcStep[3],
1580 UINT32 nTotalWidth, WINPR_ATTR_UNUSED UINT32 nTotalHeight,
1581 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
1584 const UINT32 nWidth = roi->right - roi->left;
1585 const UINT32 nHeight = roi->bottom - roi->top;
1586 const UINT32 halfWidth = (nWidth + 1) / 2;
1587 const UINT32 halfPad = halfWidth % 16;
1588 const UINT32 halfHeight = (nHeight + 1) / 2;
1589 const UINT32 quaterWidth = (nWidth + 3) / 4;
1590 const UINT32 quaterPad = quaterWidth % 16;
1591 const __m128i zero = _mm_setzero_si128();
1592 const __m128i mask = _mm_set_epi8((
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0,
1593 (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0);
1594 const __m128i mask2 = _mm_set_epi8(0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80,
1595 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80, 0, (
char)0x80);
1596 const __m128i shuffle1 =
1597 _mm_set_epi8((
char)0x80, 15, (
char)0x80, 14, (
char)0x80, 13, (
char)0x80, 12, (
char)0x80, 11,
1598 (
char)0x80, 10, (
char)0x80, 9, (
char)0x80, 8);
1599 const __m128i shuffle2 =
1600 _mm_set_epi8((
char)0x80, 7, (
char)0x80, 6, (
char)0x80, 5, (
char)0x80, 4, (
char)0x80, 3,
1601 (
char)0x80, 2, (
char)0x80, 1, (
char)0x80, 0);
1604 for (
size_t y = 0; y < nHeight; y++)
1606 const size_t yTop = y + roi->top;
1607 const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
1608 const BYTE* pYaV = pYaU + nTotalWidth / 2;
1609 BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
1610 BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;
1613 for (; x < halfWidth - halfPad; x += 16)
1616 const __m128i u = LOAD_SI128(&pYaU[x]);
1617 const __m128i u2 = _mm_unpackhi_epi8(zero, u);
1618 const __m128i u1 = _mm_unpacklo_epi8(zero, u);
1619 _mm_maskmoveu_si128(u1, mask, (
char*)&pU[2 * x]);
1620 _mm_maskmoveu_si128(u2, mask, (
char*)&pU[2 * x + 16]);
1623 const __m128i v = LOAD_SI128(&pYaV[x]);
1624 const __m128i v2 = _mm_unpackhi_epi8(zero, v);
1625 const __m128i v1 = _mm_unpacklo_epi8(zero, v);
1626 _mm_maskmoveu_si128(v1, mask, (
char*)&pV[2 * x]);
1627 _mm_maskmoveu_si128(v2, mask, (
char*)&pV[2 * x + 16]);
1631 for (; x < halfWidth; x++)
1633 const size_t odd = 2ULL * x + 1;
1640 for (
size_t y = 0; y < halfHeight; y++)
1642 const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
1643 const BYTE* pUaV = pUaU + nTotalWidth / 4;
1644 const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
1645 const BYTE* pVaV = pVaU + nTotalWidth / 4;
1646 BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
1647 BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
1650 for (; x < quaterWidth - quaterPad; x += 16)
1653 const __m128i uU = LOAD_SI128(&pUaU[x]);
1654 const __m128i uV = LOAD_SI128(&pVaU[x]);
1655 const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
1656 const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
1657 const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
1658 const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
1659 const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
1660 const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
1661 _mm_maskmoveu_si128(u1, mask2, (
char*)&pU[4 * x + 0]);
1662 _mm_maskmoveu_si128(u2, mask2, (
char*)&pU[4 * x + 16]);
1663 _mm_maskmoveu_si128(u3, mask2, (
char*)&pU[4 * x + 32]);
1664 _mm_maskmoveu_si128(u4, mask2, (
char*)&pU[4 * x + 48]);
1667 const __m128i vU = LOAD_SI128(&pUaV[x]);
1668 const __m128i vV = LOAD_SI128(&pVaV[x]);
1669 const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
1670 const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
1671 const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
1672 const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
1673 const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
1674 const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
1675 _mm_maskmoveu_si128(v1, mask2, (
char*)&pV[4 * x + 0]);
1676 _mm_maskmoveu_si128(v2, mask2, (
char*)&pV[4 * x + 16]);
1677 _mm_maskmoveu_si128(v3, mask2, (
char*)&pV[4 * x + 32]);
1678 _mm_maskmoveu_si128(v4, mask2, (
char*)&pV[4 * x + 48]);
1682 for (; x < quaterWidth; x++)
1684 pU[4 * x + 0] = pUaU[x];
1685 pV[4 * x + 0] = pUaV[x];
1686 pU[4 * x + 2] = pVaU[x];
1687 pV[4 * x + 2] = pVaV[x];
1691 return PRIMITIVES_SUCCESS;
1694static pstatus_t sse41_YUV420CombineToYUV444(avc444_frame_type type,
1695 const BYTE* WINPR_RESTRICT pSrc[3],
1696 const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
1697 BYTE* WINPR_RESTRICT pDst[3],
const UINT32 dstStep[3],
1700 if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
1703 if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
1712 return sse41_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1714 case AVC444_CHROMAv1:
1715 return sse41_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1717 case AVC444_CHROMAv2:
1718 return sse41_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
1726void primitives_init_YUV_sse41_int(
primitives_t* WINPR_RESTRICT prims)
1728#if defined(SSE_AVX_INTRINSICS_ENABLED)
1729 generic = primitives_get_generic();
1731 WLog_VRB(PRIM_TAG,
"SSE3/sse41 optimizations");
1732 prims->RGBToYUV420_8u_P3AC4R = sse41_RGBToYUV420;
1733 prims->RGBToAVC444YUV = sse41_RGBToAVC444YUV;
1734 prims->RGBToAVC444YUVv2 = sse41_RGBToAVC444YUVv2;
1735 prims->YUV420ToRGB_8u_P3AC4R = sse41_YUV420ToRGB;
1736 prims->YUV444ToRGB_8u_P3AC4R = sse41_YUV444ToRGB_8u_P3AC4R;
1737 prims->YUV420CombineToYUV444 = sse41_YUV420CombineToYUV444;
1739 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or sse41 intrinsics not available");
1740 WINPR_UNUSED(prims);