FreeRDP
prim_YUV_sse4.1.c
23#include <winpr/wtypes.h>
24#include <freerdp/config.h>
25
26#include <winpr/sysinfo.h>
27#include <winpr/crt.h>
28#include <freerdp/types.h>
29#include <freerdp/primitives.h>
30
31#include "prim_internal.h"
32#include "prim_avxsse.h"
33#include "prim_YUV.h"
34
35#if defined(SSE_AVX_INTRINSICS_ENABLED)
36#include <emmintrin.h>
37#include <tmmintrin.h>
38#include <smmintrin.h>
39
40static primitives_t* generic = NULL;
41
42/****************************************************************************/
43/* sse41 YUV420 -> RGB conversion */
44/****************************************************************************/
45static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
46 __m128i Vraw, UINT8 pos)
47{
48 const __m128i mapY[] = { mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
49 mm_set_epu32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
50 mm_set_epu32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
51 mm_set_epu32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
52 const __m128i mapUV[] = { mm_set_epu32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
53 mm_set_epu32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
54 mm_set_epu32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
55 mm_set_epu32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
56 const __m128i mask[] = { mm_set_epu32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
57 mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
58 mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
59 const __m128i c128 = _mm_set1_epi16(128);
60 __m128i BGRX = _mm_and_si128(LOAD_SI128(dst),
61 mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
62 {
63 __m128i C;
64 __m128i D;
65 __m128i E;
66 /* Load Y values and expand to 32 bit */
67 {
68 C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
69 }
70 /* Load U values and expand to 32 bit */
71 {
72 const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
73 D = _mm_sub_epi16(U, c128); /* D = U - 128 */
74 }
75 /* Load V values and expand to 32 bit */
76 {
77 const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
78 E = _mm_sub_epi16(V, c128); /* E = V - 128 */
79 }
80 /* Get the R value */
81 {
82 const __m128i c403 = _mm_set1_epi16(403);
83 const __m128i e403 =
84 _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
85 const __m128i Rs = _mm_add_epi32(C, e403);
86 const __m128i R32 = _mm_srai_epi32(Rs, 8);
87 const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
88 const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
89 const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
90 BGRX = _mm_or_si128(BGRX, packed);
91 }
92 /* Get the G value */
93 {
94 const __m128i c48 = _mm_set1_epi16(48);
95 const __m128i d48 =
96 _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
97 const __m128i c120 = _mm_set1_epi16(120);
98 const __m128i e120 =
99 _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
100 const __m128i de = _mm_add_epi32(d48, e120);
101 const __m128i Gs = _mm_sub_epi32(C, de);
102 const __m128i G32 = _mm_srai_epi32(Gs, 8);
103 const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
104 const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
105 const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
106 BGRX = _mm_or_si128(BGRX, packed);
107 }
108 /* Get the B value */
109 {
110 const __m128i c475 = _mm_set1_epi16(475);
111 const __m128i d475 =
112 _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
113 const __m128i Bs = _mm_add_epi32(C, d475);
114 const __m128i B32 = _mm_srai_epi32(Bs, 8);
115 const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
116 const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
117 const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
118 BGRX = _mm_or_si128(BGRX, packed);
119 }
120 }
121 STORE_SI128(dst++, BGRX);
122 return dst;
123}
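/* For reference, a scalar sketch of the fixed point math the vector code above
 * implements per pixel (illustrative only; clamp8() and yuv_to_bgrx_scalar()
 * are hypothetical helpers, not part of the primitives library): */
static BYTE clamp8(INT32 val)
{
    return (BYTE)((val < 0) ? 0 : ((val > 255) ? 255 : val));
}

static void yuv_to_bgrx_scalar(BYTE Y, BYTE U, BYTE V, BYTE* bgrx)
{
    const INT32 C = 256 * Y;        /* Y scaled by 256, as done by mapY above */
    const INT32 D = (INT32)U - 128;
    const INT32 E = (INT32)V - 128;
    bgrx[0] = clamp8((C + 475 * D) >> 8);          /* B */
    bgrx[1] = clamp8((C - 48 * D - 120 * E) >> 8); /* G */
    bgrx[2] = clamp8((C + 403 * E) >> 8);          /* R */
    /* bgrx[3] (the X byte) is kept from the destination, matching the masked
     * load/or of BGRX above */
}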
124
125static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
126 const UINT32* WINPR_RESTRICT srcStep,
127 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
128 const prim_size_t* WINPR_RESTRICT roi)
129{
130 const UINT32 nWidth = roi->width;
131 const UINT32 nHeight = roi->height;
132 const UINT32 pad = roi->width % 16;
133 const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
134
135 for (size_t y = 0; y < nHeight; y++)
136 {
137 __m128i* dst = (__m128i*)(pDst + dstStep * y);
138 const BYTE* YData = pSrc[0] + y * srcStep[0];
139 const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
140 const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
141
142 for (UINT32 x = 0; x < nWidth - pad; x += 16)
143 {
144 const __m128i Y = LOAD_SI128(YData);
145 const __m128i uRaw = LOAD_SI128(UData);
146 const __m128i vRaw = LOAD_SI128(VData);
147 const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
148 const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
149 YData += 16;
150 UData += 8;
151 VData += 8;
152 dst = sse41_YUV444Pixel(dst, Y, U, V, 0);
153 dst = sse41_YUV444Pixel(dst, Y, U, V, 1);
154 dst = sse41_YUV444Pixel(dst, Y, U, V, 2);
155 dst = sse41_YUV444Pixel(dst, Y, U, V, 3);
156 }
157
158 for (UINT32 x = 0; x < pad; x++)
159 {
160 const BYTE Y = *YData++;
161 const BYTE U = *UData;
162 const BYTE V = *VData;
163 dst = (__m128i*)writeYUVPixel((BYTE*)dst, PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
164
165 if (x % 2)
166 {
167 UData++;
168 VData++;
169 }
170 }
171 }
172
173 return PRIMITIVES_SUCCESS;
174}
175
176static pstatus_t sse41_YUV420ToRGB(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
177 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
178 const prim_size_t* WINPR_RESTRICT roi)
179{
180 switch (DstFormat)
181 {
182 case PIXEL_FORMAT_BGRX32:
183 case PIXEL_FORMAT_BGRA32:
184 return sse41_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
185
186 default:
187 return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
188 }
189}
190
191static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
192 const BYTE* WINPR_RESTRICT pY[2], const BYTE* WINPR_RESTRICT pU[2],
193 const BYTE* WINPR_RESTRICT pV[2], BOOL filter)
194{
195 WINPR_ASSERT(pRGB);
196 WINPR_ASSERT(pY);
197 WINPR_ASSERT(pU);
198 WINPR_ASSERT(pV);
199
200 const UINT32 DstFormat = PIXEL_FORMAT_BGRX32;
201 const UINT32 bpp = 4;
202
203 for (size_t i = 0; i < 2; i++)
204 {
205 for (size_t j = 0; j < 2; j++)
206 {
207 const BYTE Y = pY[i][offset + j];
208 BYTE U = pU[i][offset + j];
209 BYTE V = pV[i][offset + j];
210 if ((i == 0) && (j == 0) && filter)
211 {
212 const INT32 avgU =
213 4 * pU[0][offset] - pU[0][offset + 1] - pU[1][offset] - pU[1][offset + 1];
214 const INT32 avgV =
215 4 * pV[0][offset] - pV[0][offset + 1] - pV[1][offset] - pV[1][offset + 1];
216
217 U = CONDITIONAL_CLIP(avgU, pU[0][offset]);
218 V = CONDITIONAL_CLIP(avgV, pV[0][offset]);
219 }
220
221 writeYUVPixel(&pRGB[i][(j + offset) * bpp], DstFormat, Y, U, V, writePixelBGRX);
222 }
223 }
224}
225
226/* Inputs are uint16_t vectors */
227static inline __m128i sse41_yuv2x_single(const __m128i Y, __m128i U, __m128i V, const short iMulU,
228 const short iMulV)
229{
230 const __m128i zero = _mm_set1_epi8(0);
231
232 __m128i Ylo = _mm_unpacklo_epi16(Y, zero);
233 __m128i Yhi = _mm_unpackhi_epi16(Y, zero);
234 if (iMulU != 0)
235 {
236 const __m128i addX = _mm_set1_epi16(128);
237 const __m128i D = _mm_sub_epi16(U, addX);
238 const __m128i mulU = _mm_set1_epi16(iMulU);
239 const __m128i mulDlo = _mm_mullo_epi16(D, mulU);
240 const __m128i mulDhi = _mm_mulhi_epi16(D, mulU);
241 const __m128i Dlo = _mm_unpacklo_epi16(mulDlo, mulDhi);
242 Ylo = _mm_add_epi32(Ylo, Dlo);
243
244 const __m128i Dhi = _mm_unpackhi_epi16(mulDlo, mulDhi);
245 Yhi = _mm_add_epi32(Yhi, Dhi);
246 }
247 if (iMulV != 0)
248 {
249 const __m128i addX = _mm_set1_epi16(128);
250 const __m128i E = _mm_sub_epi16(V, addX);
251 const __m128i mul = _mm_set1_epi16(iMulV);
252 const __m128i mulElo = _mm_mullo_epi16(E, mul);
253 const __m128i mulEhi = _mm_mulhi_epi16(E, mul);
254 const __m128i Elo = _mm_unpacklo_epi16(mulElo, mulEhi);
255 const __m128i esumlo = _mm_add_epi32(Ylo, Elo);
256
257 const __m128i Ehi = _mm_unpackhi_epi16(mulElo, mulEhi);
258 const __m128i esumhi = _mm_add_epi32(Yhi, Ehi);
259 Ylo = esumlo;
260 Yhi = esumhi;
261 }
262
263 const __m128i rYlo = _mm_srai_epi32(Ylo, 8);
264 const __m128i rYhi = _mm_srai_epi32(Yhi, 8);
265 const __m128i rY = _mm_packs_epi32(rYlo, rYhi);
266 return rY;
267}
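/* Note: the _mm_mullo_epi16 / _mm_mulhi_epi16 pair above is the usual SSE idiom for a
 * widening 16x16 -> 32 bit multiply: interleaving the low and high product halves with
 * _mm_unpacklo_epi16 / _mm_unpackhi_epi16 reassembles the full signed 32 bit products,
 * which are then accumulated onto the Y * 256 terms before the final >> 8. */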
268
269/* Inputs are uint8_t vectors */
270static inline __m128i sse41_yuv2x(const __m128i Y, __m128i U, __m128i V, const short iMulU,
271 const short iMulV)
272{
273 const __m128i zero = _mm_set1_epi8(0);
274
275 /* Ylo = Y * 256
276 * Ulo = uint8_t -> uint16_t
277 * Vlo = uint8_t -> uint16_t
278 */
279 const __m128i Ylo = _mm_unpacklo_epi8(zero, Y);
280 const __m128i Ulo = _mm_unpacklo_epi8(U, zero);
281 const __m128i Vlo = _mm_unpacklo_epi8(V, zero);
282 const __m128i preslo = sse41_yuv2x_single(Ylo, Ulo, Vlo, iMulU, iMulV);
283
284 const __m128i Yhi = _mm_unpackhi_epi8(zero, Y);
285 const __m128i Uhi = _mm_unpackhi_epi8(U, zero);
286 const __m128i Vhi = _mm_unpackhi_epi8(V, zero);
287 const __m128i preshi = sse41_yuv2x_single(Yhi, Uhi, Vhi, iMulU, iMulV);
288 const __m128i res = _mm_packus_epi16(preslo, preshi);
289
290 return res;
291}
292
293/* const INT32 r = ((256L * C(Y) + 0L * D(U) + 403L * E(V))) >> 8; */
294static inline __m128i sse41_yuv2r(const __m128i Y, __m128i U, __m128i V)
295{
296 return sse41_yuv2x(Y, U, V, 0, 403);
297}
298
299/* const INT32 g = ((256L * C(Y) - 48L * D(U) - 120L * E(V))) >> 8; */
300static inline __m128i sse41_yuv2g(const __m128i Y, __m128i U, __m128i V)
301{
302 return sse41_yuv2x(Y, U, V, -48, -120);
303}
304
305/* const INT32 b = ((256L * C(Y) + 475L * D(U) + 0L * E(V))) >> 8; */
306static inline __m128i sse41_yuv2b(const __m128i Y, __m128i U, __m128i V)
307{
308 return sse41_yuv2x(Y, U, V, 475, 0);
309}
310
311static inline void sse41_BGRX_fillRGB_pixel(BYTE* WINPR_RESTRICT pRGB, __m128i Y, __m128i U,
312 __m128i V)
313{
314 const __m128i zero = _mm_set1_epi8(0);
315 /* Y * 256 */
316 const __m128i r = sse41_yuv2r(Y, U, V);
317 const __m128i rx[2] = { _mm_unpackhi_epi8(r, zero), _mm_unpacklo_epi8(r, zero) };
318
319 const __m128i g = sse41_yuv2g(Y, U, V);
320 const __m128i b = sse41_yuv2b(Y, U, V);
321
322 const __m128i bg[2] = { _mm_unpackhi_epi8(b, g), _mm_unpacklo_epi8(b, g) };
323
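    /* The store mask below has 0x00 in the fourth byte of every pixel, so the
     * _mm_maskmoveu_si128 stores write only the B, G and R bytes and leave the
     * X (alpha) byte of the existing destination pixels untouched. */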
324 const __m128i mask = mm_set_epu8(0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF,
325 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF);
326
327 __m128i* rgb = (__m128i*)pRGB;
328 const __m128i bgrx0 = _mm_unpacklo_epi16(bg[1], rx[1]);
329 _mm_maskmoveu_si128(bgrx0, mask, (char*)&rgb[0]);
330 const __m128i bgrx1 = _mm_unpackhi_epi16(bg[1], rx[1]);
331 _mm_maskmoveu_si128(bgrx1, mask, (char*)&rgb[1]);
332 const __m128i bgrx2 = _mm_unpacklo_epi16(bg[0], rx[0]);
333 _mm_maskmoveu_si128(bgrx2, mask, (char*)&rgb[2]);
334 const __m128i bgrx3 = _mm_unpackhi_epi16(bg[0], rx[0]);
335 _mm_maskmoveu_si128(bgrx3, mask, (char*)&rgb[3]);
336}
337
338static inline __m128i odd1sum(__m128i u1)
339{
340 const __m128i zero = _mm_set1_epi8(0);
341 const __m128i u1hi = _mm_unpackhi_epi8(u1, zero);
342 const __m128i u1lo = _mm_unpacklo_epi8(u1, zero);
343 return _mm_hadds_epi16(u1lo, u1hi);
344}
345
346static inline __m128i odd0sum(__m128i u0, __m128i u1sum)
347{
348 /* Select the odd bytes of u0 (the even ones are masked out), zero extend the
349 * uint8_t values to uint16_t and add them to the row 1 pair sums */
350 const __m128i mask = mm_set_epu8(0x80, 0x0F, 0x80, 0x0D, 0x80, 0x0B, 0x80, 0x09, 0x80, 0x07,
351 0x80, 0x05, 0x80, 0x03, 0x80, 0x01);
352 const __m128i u0odd = _mm_shuffle_epi8(u0, mask);
353 return _mm_adds_epi16(u1sum, u0odd);
354}
355
356static inline __m128i calcavg(__m128i u0even, __m128i sum)
357{
358 const __m128i u4zero = _mm_slli_epi16(u0even, 2);
359 const __m128i uavg = _mm_sub_epi16(u4zero, sum);
360 const __m128i zero = _mm_set1_epi8(0);
361 const __m128i savg = _mm_packus_epi16(uavg, zero);
362 const __m128i smask = mm_set_epu8(0x80, 0x07, 0x80, 0x06, 0x80, 0x05, 0x80, 0x04, 0x80, 0x03,
363 0x80, 0x02, 0x80, 0x01, 0x80, 0x00);
364 return _mm_shuffle_epi8(savg, smask);
365}
366
367static inline __m128i diffmask(__m128i avg, __m128i u0even)
368{
369 /* Compare |u0even - avg| against 30: lanes below the threshold keep their original
370 * value, only the others get avg applied. int16 math avoids signed 8bit overflow issues
371 */
372 const __m128i diff = _mm_subs_epi16(u0even, avg);
373 const __m128i absdiff = _mm_abs_epi16(diff);
374 const __m128i val30 = _mm_set1_epi16(30);
375 return _mm_cmplt_epi16(absdiff, val30);
376}
377
378static inline void sse41_filter(__m128i pU[2])
379{
380 const __m128i u1sum = odd1sum(pU[1]);
381 const __m128i sum = odd0sum(pU[0], u1sum);
382
383 /* Mask out the odd bytes; the zeroed bytes already widen the uint8_t values to uint16_t */
384 const __m128i emask = mm_set_epu8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
385 0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
386 const __m128i u0even = _mm_and_si128(pU[0], emask);
387 const __m128i avg = calcavg(u0even, sum);
388 const __m128i umask = diffmask(avg, u0even);
389
390 const __m128i u0orig = _mm_and_si128(u0even, umask);
391 const __m128i u0avg = _mm_andnot_si128(umask, avg);
392 const __m128i evenresult = _mm_or_si128(u0orig, u0avg);
393 const __m128i omask = mm_set_epu8(0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
394 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00);
395 const __m128i u0odd = _mm_and_si128(pU[0], omask);
396 const __m128i result = _mm_or_si128(evenresult, u0odd);
397 pU[0] = result;
398}
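/* A scalar sketch of what sse41_filter computes for each top-left sample of a 2x2
 * chroma block (illustrative only; the function name is hypothetical and the helper
 * is not part of this file). The odd samples and the second row stay untouched: */
static BYTE example_chroma_filter(BYTE u00, BYTE u01, BYTE u10, BYTE u11)
{
    const INT32 avg = 4 * u00 - u01 - u10 - u11;
    const BYTE clamped = (BYTE)((avg < 0) ? 0 : ((avg > 255) ? 255 : avg));
    const INT32 diff = (u00 > clamped) ? (u00 - clamped) : (clamped - u00);
    return (diff < 30) ? u00 : clamped; /* apply the average only on large deviations */
}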
399
400static inline void sse41_BGRX_fillRGB(BYTE* WINPR_RESTRICT pRGB[2], const __m128i pY[2],
401 __m128i pU[2], __m128i pV[2])
402{
403 WINPR_ASSERT(pRGB);
404 WINPR_ASSERT(pY);
405 WINPR_ASSERT(pU);
406 WINPR_ASSERT(pV);
407
408 sse41_filter(pU);
409 sse41_filter(pV);
410
411 for (size_t i = 0; i < 2; i++)
412 {
413 sse41_BGRX_fillRGB_pixel(pRGB[i], pY[i], pU[i], pV[i]);
414 }
415}
416
417static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
418 BYTE* WINPR_RESTRICT pDst[2], const BYTE* WINPR_RESTRICT YData[2],
419 const BYTE* WINPR_RESTRICT UData[2], const BYTE* WINPR_RESTRICT VData[2], UINT32 nWidth)
420{
421 WINPR_ASSERT((nWidth % 2) == 0);
422 const UINT32 pad = nWidth % 16;
423
424 size_t x = 0;
425 for (; x < nWidth - pad; x += 16)
426 {
427 const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) };
428 __m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) };
429 __m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) };
430
431 BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
432 sse41_BGRX_fillRGB(dstp, Y, U, V);
433 }
434
435 for (; x < nWidth; x += 2)
436 {
437 BGRX_fillRGB(x, pDst, YData, UData, VData, TRUE);
438 }
439
440 return PRIMITIVES_SUCCESS;
441}
442
443static inline void BGRX_fillRGB_single(size_t offset, BYTE* WINPR_RESTRICT pRGB,
444 const BYTE* WINPR_RESTRICT pY, const BYTE* WINPR_RESTRICT pU,
445 const BYTE* WINPR_RESTRICT pV, WINPR_ATTR_UNUSED BOOL filter)
446{
447 WINPR_ASSERT(pRGB);
448 WINPR_ASSERT(pY);
449 WINPR_ASSERT(pU);
450 WINPR_ASSERT(pV);
451
452 const UINT32 bpp = 4;
453
454 for (size_t j = 0; j < 2; j++)
455 {
456 const BYTE Y = pY[offset + j];
457 BYTE U = pU[offset + j];
458 BYTE V = pV[offset + j];
459
460 writeYUVPixel(&pRGB[(j + offset) * bpp], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
461 }
462}
463
464static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(
465 BYTE* WINPR_RESTRICT pDst, const BYTE* WINPR_RESTRICT YData, const BYTE* WINPR_RESTRICT UData,
466 const BYTE* WINPR_RESTRICT VData, UINT32 nWidth)
467{
468 WINPR_ASSERT((nWidth % 2) == 0);
469
470 for (size_t x = 0; x < nWidth; x += 2)
471 {
472 BGRX_fillRGB_single(x, pDst, YData, UData, VData, TRUE);
473 }
474
475 return PRIMITIVES_SUCCESS;
476}
477
478static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* WINPR_RESTRICT pSrc[],
479 const UINT32 srcStep[],
480 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
481 const prim_size_t* WINPR_RESTRICT roi)
482{
483 const UINT32 nWidth = roi->width;
484 const UINT32 nHeight = roi->height;
485
486 size_t y = 0;
487 for (; y < nHeight - nHeight % 2; y += 2)
488 {
489 BYTE* dst[] = { (pDst + dstStep * y), (pDst + dstStep * (y + 1)) };
490 const BYTE* YData[] = { pSrc[0] + y * srcStep[0], pSrc[0] + (y + 1) * srcStep[0] };
491 const BYTE* UData[] = { pSrc[1] + y * srcStep[1], pSrc[1] + (y + 1) * srcStep[1] };
492 const BYTE* VData[] = { pSrc[2] + y * srcStep[2], pSrc[2] + (y + 1) * srcStep[2] };
493
494 const pstatus_t rc =
495 sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(dst, YData, UData, VData, nWidth);
496 if (rc != PRIMITIVES_SUCCESS)
497 return rc;
498 }
499 for (; y < nHeight; y++)
500 {
501 BYTE* dst = (pDst + dstStep * y);
502 const BYTE* YData = pSrc[0] + y * srcStep[0];
503 const BYTE* UData = pSrc[1] + y * srcStep[1];
504 const BYTE* VData = pSrc[2] + y * srcStep[2];
505 const pstatus_t rc =
506 sse41_YUV444ToRGB_8u_P3AC4R_BGRX_SINGLE_ROW(dst, YData, UData, VData, nWidth);
507 if (rc != PRIMITIVES_SUCCESS)
508 return rc;
509 }
510
511 return PRIMITIVES_SUCCESS;
512}
513
514static pstatus_t sse41_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[],
515 const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
516 UINT32 dstStep, UINT32 DstFormat,
517 const prim_size_t* WINPR_RESTRICT roi)
518{
519 switch (DstFormat)
520 {
521 case PIXEL_FORMAT_BGRX32:
522 case PIXEL_FORMAT_BGRA32:
523 return sse41_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
524
525 default:
526 return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
527 }
528}
529
530/****************************************************************************/
531/* sse41 RGB -> YUV420 conversion **/
532/****************************************************************************/
533
555#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
556#define BGRX_U_FACTORS \
557 _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
558#define BGRX_V_FACTORS \
559 _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
560#define CONST128_FACTORS _mm_set1_epi8(-128)
561
562#define Y_SHIFT 7
563#define U_SHIFT 8
564#define V_SHIFT 8
565
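/*
 * With the BGRX memory layout these factor vectors assume, the
 * _mm_maddubs_epi16 + _mm_hadd_epi16 + shift pipeline below amounts to the
 * following per-pixel arithmetic (scalar sketch, illustrative only):
 *
 *   Y = (  9 * B +  92 * G +  27 * R) >> Y_SHIFT;           Y_SHIFT = 7
 *   U = (( 127 * B -  99 * G -  29 * R) >> U_SHIFT) + 128;  U_SHIFT = 8
 *   V = (( -12 * B - 116 * G + 127 * R) >> V_SHIFT) + 128;  V_SHIFT = 8
 *
 * The "+ 128" is realised by subtracting CONST128_FACTORS (-128) from the
 * packed signed bytes.
 */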
566/*
567TODO:
568RGB[AX] could be supported using the following factors. Instead of loading the
569constants directly, the functions below could be passed pointers to the correct vectors
570depending on the source picture format.
571
572PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
573 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0
574};
575PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
576 -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0
577};
578PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
579 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0
580};
581*/
582
583static inline void sse41_BGRX_TO_YUV(const BYTE* WINPR_RESTRICT pLine1, BYTE* WINPR_RESTRICT pYLine,
584 BYTE* WINPR_RESTRICT pULine, BYTE* WINPR_RESTRICT pVLine)
585{
586 const BYTE r1 = pLine1[2];
587 const BYTE g1 = pLine1[1];
588 const BYTE b1 = pLine1[0];
589
590 if (pYLine)
591 pYLine[0] = RGB2Y(r1, g1, b1);
592 if (pULine)
593 pULine[0] = RGB2U(r1, g1, b1);
594 if (pVLine)
595 pVLine[0] = RGB2V(r1, g1, b1);
596}
597
598/* compute the luma (Y) component from a single rgb source line */
599
600static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
601{
602 const __m128i y_factors = BGRX_Y_FACTORS;
603 const __m128i* argb = (const __m128i*)src;
604 __m128i* ydst = (__m128i*)dst;
605
606 UINT32 x = 0;
607
608 for (; x < width - width % 16; x += 16)
609 {
610 /* store 16 rgba pixels in 4 128 bit registers */
611 __m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels
612 {
613 x0 = _mm_maddubs_epi16(x0, y_factors);
614
615 __m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels
616 x1 = _mm_maddubs_epi16(x1, y_factors);
617 x0 = _mm_hadds_epi16(x0, x1);
618 x0 = _mm_srli_epi16(x0, Y_SHIFT);
619 }
620
621 __m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels
622 {
623 x2 = _mm_maddubs_epi16(x2, y_factors);
624
625 __m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels
626 x3 = _mm_maddubs_epi16(x3, y_factors);
627 x2 = _mm_hadds_epi16(x2, x3);
628 x2 = _mm_srli_epi16(x2, Y_SHIFT);
629 }
630
631 x0 = _mm_packus_epi16(x0, x2);
632 /* save to y plane */
633 STORE_SI128(ydst++, x0);
634 }
635
636 for (; x < width; x++)
637 {
638 sse41_BGRX_TO_YUV(&src[4ULL * x], &dst[x], NULL, NULL);
639 }
640}
641
642/* compute the chrominance (UV) components from two rgb source lines */
643
644static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
645 const BYTE* WINPR_RESTRICT src2,
646 BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
647 UINT32 width)
648{
649 const __m128i u_factors = BGRX_U_FACTORS;
650 const __m128i v_factors = BGRX_V_FACTORS;
651 const __m128i vector128 = CONST128_FACTORS;
652
653 size_t x = 0;
654
655 for (; x < width - width % 16; x += 16)
656 {
657 const __m128i* rgb1 = (const __m128i*)&src1[4ULL * x];
658 const __m128i* rgb2 = (const __m128i*)&src2[4ULL * x];
659 __m64* udst = (__m64*)&dst1[x / 2];
660 __m64* vdst = (__m64*)&dst2[x / 2];
661
662 /* subsample 16x2 pixels into 16x1 pixels */
663 __m128i x0 = LOAD_SI128(&rgb1[0]);
664 __m128i x4 = LOAD_SI128(&rgb2[0]);
665 x0 = _mm_avg_epu8(x0, x4);
666
667 __m128i x1 = LOAD_SI128(&rgb1[1]);
668 x4 = LOAD_SI128(&rgb2[1]);
669 x1 = _mm_avg_epu8(x1, x4);
670
671 __m128i x2 = LOAD_SI128(&rgb1[2]);
672 x4 = LOAD_SI128(&rgb2[2]);
673 x2 = _mm_avg_epu8(x2, x4);
674
675 __m128i x3 = LOAD_SI128(&rgb1[3]);
676 x4 = LOAD_SI128(&rgb2[3]);
677 x3 = _mm_avg_epu8(x3, x4);
678
679 /* subsample these 16x1 pixels into 8x1 pixels */
685 x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
686 x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
687 x0 = _mm_avg_epu8(x0, x4);
688 x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
689 x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
690 x1 = _mm_avg_epu8(x1, x4);
691 /* multiplications and subtotals */
692 x2 = _mm_maddubs_epi16(x0, u_factors);
693 x3 = _mm_maddubs_epi16(x1, u_factors);
694 x4 = _mm_maddubs_epi16(x0, v_factors);
695 __m128i x5 = _mm_maddubs_epi16(x1, v_factors);
696 /* the total sums */
697 x0 = _mm_hadd_epi16(x2, x3);
698 x1 = _mm_hadd_epi16(x4, x5);
699 /* shift the results */
700 x0 = _mm_srai_epi16(x0, U_SHIFT);
701 x1 = _mm_srai_epi16(x1, V_SHIFT);
702 /* pack the 16 words into bytes */
703 x0 = _mm_packs_epi16(x0, x1);
704 /* add 128 */
705 x0 = _mm_sub_epi8(x0, vector128);
706 /* the lower 8 bytes go to the u plane */
707 _mm_storel_pi(udst, _mm_castsi128_ps(x0));
708 /* the upper 8 bytes go to the v plane */
709 _mm_storeh_pi(vdst, _mm_castsi128_ps(x0));
710 }
711
712 for (; x < width - width % 2; x += 2)
713 {
714 BYTE u[4] = { 0 };
715 BYTE v[4] = { 0 };
716 sse41_BGRX_TO_YUV(&src1[4ULL * x], NULL, &u[0], &v[0]);
717 sse41_BGRX_TO_YUV(&src1[4ULL * (1ULL + x)], NULL, &u[1], &v[1]);
718 sse41_BGRX_TO_YUV(&src2[4ULL * x], NULL, &u[2], &v[2]);
719 sse41_BGRX_TO_YUV(&src2[4ULL * (1ULL + x)], NULL, &u[3], &v[3]);
720 const INT16 u4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)u[0] + u[1] + u[2] + u[3]);
721 const INT16 uu = WINPR_ASSERTING_INT_CAST(INT16, u4 / 4);
722 const BYTE u8 = CLIP(uu);
723 dst1[x / 2] = u8;
724
725 const INT16 v4 = WINPR_ASSERTING_INT_CAST(INT16, (INT16)v[0] + v[1] + v[2] + v[3]);
726 const INT16 vu = WINPR_ASSERTING_INT_CAST(INT16, v4 / 4);
727 const BYTE v8 = CLIP(vu);
728 dst2[x / 2] = v8;
729 }
730}
731
732static pstatus_t sse41_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
733 BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
734 const prim_size_t* WINPR_RESTRICT roi)
735{
736 if (roi->height < 1 || roi->width < 1)
737 {
738 return !PRIMITIVES_SUCCESS;
739 }
740
741 size_t y = 0;
742 for (; y < roi->height - roi->height % 2; y += 2)
743 {
744 const BYTE* line1 = &pSrc[y * srcStep];
745 const BYTE* line2 = &pSrc[(1ULL + y) * srcStep];
746 BYTE* ydst1 = &pDst[0][y * dstStep[0]];
747 BYTE* ydst2 = &pDst[0][(1ULL + y) * dstStep[0]];
748 BYTE* udst = &pDst[1][y / 2 * dstStep[1]];
749 BYTE* vdst = &pDst[2][y / 2 * dstStep[2]];
750
751 sse41_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
752 sse41_RGBToYUV420_BGRX_Y(line1, ydst1, roi->width);
753 sse41_RGBToYUV420_BGRX_Y(line2, ydst2, roi->width);
754 }
755
756 for (; y < roi->height; y++)
757 {
758 const BYTE* line = &pSrc[y * srcStep];
759 BYTE* ydst = &pDst[0][1ULL * y * dstStep[0]];
760 sse41_RGBToYUV420_BGRX_Y(line, ydst, roi->width);
761 }
762
763 return PRIMITIVES_SUCCESS;
764}
765
766static pstatus_t sse41_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
767 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
768 const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
769{
770 switch (srcFormat)
771 {
772 case PIXEL_FORMAT_BGRX32:
773 case PIXEL_FORMAT_BGRA32:
774 return sse41_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);
775
776 default:
777 return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
778 }
779}
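/* Hypothetical usage sketch (variable names illustrative): callers go through the
 * primitives table so the generic fallback covers formats this file does not handle:
 *
 *   primitives_t* prims = primitives_get();
 *   prim_size_t roi = { width, height };
 *   prims->RGBToYUV420_8u_P3AC4R(rgbData, PIXEL_FORMAT_BGRX32, rgbStep,
 *                                yuvPlanes, yuvSteps, &roi);
 */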
780
781/****************************************************************************/
782/* sse41 RGB -> AVC444-YUV conversion **/
783/****************************************************************************/
784
785static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
786 const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
787 BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
788 BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
789 BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
790{
791 const __m128i* argbEven = (const __m128i*)srcEven;
792 const __m128i* argbOdd = (const __m128i*)srcOdd;
793 const __m128i y_factors = BGRX_Y_FACTORS;
794 const __m128i u_factors = BGRX_U_FACTORS;
795 const __m128i v_factors = BGRX_V_FACTORS;
796 const __m128i vector128 = CONST128_FACTORS;
797
798 UINT32 x = 0;
799 for (; x < width - width % 16; x += 16)
800 {
801 /* store 16 rgba pixels in 4 128 bit registers */
802 const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels
803 const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels
804 const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels
805 const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels
806 const __m128i xo1 = LOAD_SI128(argbOdd++); // 1st 4 pixels
807 const __m128i xo2 = LOAD_SI128(argbOdd++); // 2nd 4 pixels
808 const __m128i xo3 = LOAD_SI128(argbOdd++); // 3rd 4 pixels
809 const __m128i xo4 = LOAD_SI128(argbOdd++); // 4th 4 pixels
810 {
811 /* Y: multiplications with subtotals and horizontal sums */
812 const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
813 _mm_maddubs_epi16(xe2, y_factors)),
814 Y_SHIFT);
815 const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
816 _mm_maddubs_epi16(xe4, y_factors)),
817 Y_SHIFT);
818 const __m128i ye = _mm_packus_epi16(ye1, ye2);
819 const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
820 _mm_maddubs_epi16(xo2, y_factors)),
821 Y_SHIFT);
822 const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
823 _mm_maddubs_epi16(xo4, y_factors)),
824 Y_SHIFT);
825 const __m128i yo = _mm_packus_epi16(yo1, yo2);
826 /* store y [b1] */
827 STORE_SI128(b1Even, ye);
828 b1Even += 16;
829
830 if (b1Odd)
831 {
832 STORE_SI128(b1Odd, yo);
833 b1Odd += 16;
834 }
835 }
836 {
837 /* We have now
838 * 16 even U values in ue
839 * 16 odd U values in uo
840 *
841 * We need to split these according to
842 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
843 __m128i ue;
844 __m128i uo = { 0 };
845 {
846 const __m128i ue1 =
847 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
848 _mm_maddubs_epi16(xe2, u_factors)),
849 U_SHIFT);
850 const __m128i ue2 =
851 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
852 _mm_maddubs_epi16(xe4, u_factors)),
853 U_SHIFT);
854 ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
855 }
856
857 if (b1Odd)
858 {
859 const __m128i uo1 =
860 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
861 _mm_maddubs_epi16(xo2, u_factors)),
862 U_SHIFT);
863 const __m128i uo2 =
864 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
865 _mm_maddubs_epi16(xo4, u_factors)),
866 U_SHIFT);
867 uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
868 }
869
870 /* Now we need the following storage distribution:
871 * 2x 2y -> b2
872 * x 2y+1 -> b4
873 * 2x+1 2y -> b6 */
874 if (b1Odd) /* b2 */
875 {
876 const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
877 const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
878 const __m128i hi = _mm_add_epi16(ueh, uoh);
879 const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
880 const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
881 const __m128i lo = _mm_add_epi16(uel, uol);
882 const __m128i added = _mm_hadd_epi16(lo, hi);
883 const __m128i avg16 = _mm_srai_epi16(added, 2);
884 const __m128i avg = _mm_packus_epi16(avg16, avg16);
885 _mm_storel_epi64((__m128i*)b2, avg);
886 }
887 else
888 {
889 const __m128i mask =
890 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
891 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
892 const __m128i ud = _mm_shuffle_epi8(ue, mask);
893 _mm_storel_epi64((__m128i*)b2, ud);
894 }
895
896 b2 += 8;
897
898 if (b1Odd) /* b4 */
899 {
900 STORE_SI128(b4, uo);
901 b4 += 16;
902 }
903
904 {
905 /* b6 */
906 const __m128i mask =
907 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
908 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
909 const __m128i ude = _mm_shuffle_epi8(ue, mask);
910 _mm_storel_epi64((__m128i*)b6, ude);
911 b6 += 8;
912 }
913 }
914 {
915 /* We have now
916 * 16 even V values in ue
917 * 16 odd V values in uo
918 *
919 * We need to split these according to
920 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
921 __m128i ve;
922 __m128i vo = { 0 };
923 {
924 const __m128i ve1 =
925 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
926 _mm_maddubs_epi16(xe2, v_factors)),
927 V_SHIFT);
928 const __m128i ve2 =
929 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
930 _mm_maddubs_epi16(xe4, v_factors)),
931 V_SHIFT);
932 ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
933 }
934
935 if (b1Odd)
936 {
937 const __m128i vo1 =
938 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
939 _mm_maddubs_epi16(xo2, v_factors)),
940 V_SHIFT);
941 const __m128i vo2 =
942 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
943 _mm_maddubs_epi16(xo4, v_factors)),
944 V_SHIFT);
945 vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
946 }
947
948 /* Now we need the following storage distribution:
949 * 2x 2y -> b3
950 * x 2y+1 -> b5
951 * 2x+1 2y -> b7 */
952 if (b1Odd) /* b3 */
953 {
954 const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
955 const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
956 const __m128i hi = _mm_add_epi16(veh, voh);
957 const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
958 const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
959 const __m128i lo = _mm_add_epi16(vel, vol);
960 const __m128i added = _mm_hadd_epi16(lo, hi);
961 const __m128i avg16 = _mm_srai_epi16(added, 2);
962 const __m128i avg = _mm_packus_epi16(avg16, avg16);
963 _mm_storel_epi64((__m128i*)b3, avg);
964 }
965 else
966 {
967 const __m128i mask =
968 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
969 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
970 const __m128i vd = _mm_shuffle_epi8(ve, mask);
971 _mm_storel_epi64((__m128i*)b3, vd);
972 }
973
974 b3 += 8;
975
976 if (b1Odd) /* b5 */
977 {
978 STORE_SI128(b5, vo);
979 b5 += 16;
980 }
981
982 {
983 /* b7 */
984 const __m128i mask =
985 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
986 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
987 const __m128i vde = _mm_shuffle_epi8(ve, mask);
988 _mm_storel_epi64((__m128i*)b7, vde);
989 b7 += 8;
990 }
991 }
992 }
993
994 general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
995 b7, width);
996}
997
998static pstatus_t sse41_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc,
999 WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
1000 BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[],
1001 BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[],
1002 const prim_size_t* WINPR_RESTRICT roi)
1003{
1004 if (roi->height < 1 || roi->width < 1)
1005 return !PRIMITIVES_SUCCESS;
1006
1007 size_t y = 0;
1008 for (; y < roi->height - roi->height % 2; y += 2)
1009 {
1010 const BYTE* srcEven = pSrc + y * srcStep;
1011 const BYTE* srcOdd = pSrc + (y + 1) * srcStep;
1012 const size_t i = y >> 1;
1013 const size_t n = (i & (size_t)~7) + i;
1014 BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1015 BYTE* b1Odd = (b1Even + dst1Step[0]);
1016 BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1017 BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1018 BYTE* b4 = pDst2[0] + 1ULL * dst2Step[0] * n;
1019 BYTE* b5 = b4 + 8ULL * dst2Step[0];
1020 BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1021 BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1022 sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
1023 roi->width);
1024 }
1025
1026 for (; y < roi->height; y++)
1027 {
1028 const BYTE* srcEven = pSrc + y * srcStep;
1029 BYTE* b1Even = pDst1[0] + y * dst1Step[0];
1030 BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
1031 BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
1032 BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
1033 BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
1034 general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(0, srcEven, NULL, b1Even, NULL, b2, b3, NULL, NULL,
1035 b6, b7, roi->width);
1036 }
1037
1038 return PRIMITIVES_SUCCESS;
1039}
1040
1041static pstatus_t sse41_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1042 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1043 const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1044 const UINT32 dst2Step[],
1045 const prim_size_t* WINPR_RESTRICT roi)
1046{
1047 switch (srcFormat)
1048 {
1049 case PIXEL_FORMAT_BGRX32:
1050 case PIXEL_FORMAT_BGRA32:
1051 return sse41_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1052 dst2Step, roi);
1053
1054 default:
1055 return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1056 dst2Step, roi);
1057 }
1058}
1059
1060/* Mapping of arguments:
1061 *
1062 * b1 [even lines] -> yLumaDstEven
1063 * b1 [odd lines] -> yLumaDstOdd
1064 * b2 -> uLumaDst
1065 * b3 -> vLumaDst
1066 * b4 -> yChromaDst1
1067 * b5 -> yChromaDst2
1068 * b6 -> uChromaDst1
1069 * b7 -> uChromaDst2
1070 * b8 -> vChromaDst1
1071 * b9 -> vChromaDst2
1072 */
1073static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
1074 const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
1075 BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
1076 BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
1077 BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
1078 BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
1079 BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
1080 BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
1081{
1082 const __m128i vector128 = CONST128_FACTORS;
1083 const __m128i* argbEven = (const __m128i*)srcEven;
1084 const __m128i* argbOdd = (const __m128i*)srcOdd;
1085
1086 UINT32 x = 0;
1087 for (; x < width - width % 16; x += 16)
1088 {
1089 /* store 16 rgba pixels in 4 128 bit registers
1090 * for even and odd rows.
1091 */
1092 const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */
1093 const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */
1094 const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */
1095 const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */
1096 const __m128i xo1 = LOAD_SI128(argbOdd++); /* 1st 4 pixels */
1097 const __m128i xo2 = LOAD_SI128(argbOdd++); /* 2nd 4 pixels */
1098 const __m128i xo3 = LOAD_SI128(argbOdd++); /* 3rd 4 pixels */
1099 const __m128i xo4 = LOAD_SI128(argbOdd++); /* 4th 4 pixels */
1100 {
1101 /* Y: multiplications with subtotals and horizontal sums */
1102 const __m128i y_factors = BGRX_Y_FACTORS;
1103 const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
1104 _mm_maddubs_epi16(xe2, y_factors)),
1105 Y_SHIFT);
1106 const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
1107 _mm_maddubs_epi16(xe4, y_factors)),
1108 Y_SHIFT);
1109 const __m128i ye = _mm_packus_epi16(ye1, ye2);
1110 /* store y [b1] */
1111 STORE_SI128(yLumaDstEven, ye);
1112 yLumaDstEven += 16;
1113 }
1114
1115 if (yLumaDstOdd)
1116 {
1117 const __m128i y_factors = BGRX_Y_FACTORS;
1118 const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
1119 _mm_maddubs_epi16(xo2, y_factors)),
1120 Y_SHIFT);
1121 const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
1122 _mm_maddubs_epi16(xo4, y_factors)),
1123 Y_SHIFT);
1124 const __m128i yo = _mm_packus_epi16(yo1, yo2);
1125 STORE_SI128(yLumaDstOdd, yo);
1126 yLumaDstOdd += 16;
1127 }
1128
1129 {
1130 /* We have now
1131 * 16 even U values in ue
1132 * 16 odd U values in uo
1133 *
1134 * We need to split these according to
1135 * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
1136 /* U: multiplications with subtotals and horizontal sums */
1137 __m128i ue;
1138 __m128i uo;
1139 __m128i uavg;
1140 {
1141 const __m128i u_factors = BGRX_U_FACTORS;
1142 const __m128i ue1 =
1143 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
1144 _mm_maddubs_epi16(xe2, u_factors)),
1145 U_SHIFT);
1146 const __m128i ue2 =
1147 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
1148 _mm_maddubs_epi16(xe4, u_factors)),
1149 U_SHIFT);
1150 const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
1151 ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
1152 uavg = ueavg;
1153 }
1154 {
1155 const __m128i u_factors = BGRX_U_FACTORS;
1156 const __m128i uo1 =
1157 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
1158 _mm_maddubs_epi16(xo2, u_factors)),
1159 U_SHIFT);
1160 const __m128i uo2 =
1161 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
1162 _mm_maddubs_epi16(xo4, u_factors)),
1163 U_SHIFT);
1164 const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
1165 uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
1166 uavg = _mm_add_epi16(uavg, uoavg);
1167 uavg = _mm_srai_epi16(uavg, 2);
1168 uavg = _mm_packs_epi16(uavg, uoavg);
1169 uavg = _mm_sub_epi8(uavg, vector128);
1170 }
1171 /* Now we need the following storage distribution:
1172 * 2x 2y -> uLumaDst
1173 * 2x+1 y -> yChromaDst1
1174 * 4x 2y+1 -> uChromaDst1
1175 * 4x+2 2y+1 -> vChromaDst1 */
1176 {
1177 const __m128i mask =
1178 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1179 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1180 const __m128i ude = _mm_shuffle_epi8(ue, mask);
1181 _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
1182 yEvenChromaDst1 += 8;
1183 }
1184
1185 if (yLumaDstOdd)
1186 {
1187 const __m128i mask =
1188 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1189 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1190 const __m128i udo /* codespell:ignore udo */ = _mm_shuffle_epi8(uo, mask);
1191 _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); // codespell:ignore udo
1192 yOddChromaDst1 += 8;
1193 }
1194
1195 if (yLumaDstOdd)
1196 {
1197 const __m128i mask =
1198 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1199 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1200 const __m128i ud = _mm_shuffle_epi8(uo, mask);
1201 int* uDst1 = (int*)uChromaDst1;
1202 int* vDst1 = (int*)vChromaDst1;
1203 const int* src = (const int*)&ud;
1204 _mm_stream_si32(uDst1, src[0]);
1205 _mm_stream_si32(vDst1, src[1]);
1206 uChromaDst1 += 4;
1207 vChromaDst1 += 4;
1208 }
1209
1210 if (yLumaDstOdd)
1211 {
1212 _mm_storel_epi64((__m128i*)uLumaDst, uavg);
1213 uLumaDst += 8;
1214 }
1215 else
1216 {
1217 const __m128i mask =
1218 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1219 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1220 const __m128i ud = _mm_shuffle_epi8(ue, mask);
1221 _mm_storel_epi64((__m128i*)uLumaDst, ud);
1222 uLumaDst += 8;
1223 }
1224 }
1225
1226 {
1227 /* V: multiplications with subtotals and horizontal sums */
1228 __m128i ve;
1229 __m128i vo;
1230 __m128i vavg;
1231 {
1232 const __m128i v_factors = BGRX_V_FACTORS;
1233 const __m128i ve1 =
1234 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
1235 _mm_maddubs_epi16(xe2, v_factors)),
1236 V_SHIFT);
1237 const __m128i ve2 =
1238 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
1239 _mm_maddubs_epi16(xe4, v_factors)),
1240 V_SHIFT);
1241 const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
1242 ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
1243 vavg = veavg;
1244 }
1245 {
1246 const __m128i v_factors = BGRX_V_FACTORS;
1247 const __m128i vo1 =
1248 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
1249 _mm_maddubs_epi16(xo2, v_factors)),
1250 V_SHIFT);
1251 const __m128i vo2 =
1252 _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
1253 _mm_maddubs_epi16(xo4, v_factors)),
1254 V_SHIFT);
1255 const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
1256 vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
1257 vavg = _mm_add_epi16(vavg, voavg);
1258 vavg = _mm_srai_epi16(vavg, 2);
1259 vavg = _mm_packs_epi16(vavg, voavg);
1260 vavg = _mm_sub_epi8(vavg, vector128);
1261 }
1262 /* Now we need the following storage distribution:
1263 * 2x 2y -> vLumaDst
1264 * 2x+1 y -> yChromaDst2
1265 * 4x 2y+1 -> uChromaDst2
1266 * 4x+2 2y+1 -> vChromaDst2 */
1267 {
1268 const __m128i mask =
1269 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1270 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1271 __m128i vde = _mm_shuffle_epi8(ve, mask);
1272 _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
1273 yEvenChromaDst2 += 8;
1274 }
1275
1276 if (yLumaDstOdd)
1277 {
1278 const __m128i mask =
1279 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1280 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
1281 __m128i vdo = _mm_shuffle_epi8(vo, mask);
1282 _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
1283 yOddChromaDst2 += 8;
1284 }
1285
1286 if (yLumaDstOdd)
1287 {
1288 const __m128i mask =
1289 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1290 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
1291 const __m128i vd = _mm_shuffle_epi8(vo, mask);
1292 int* uDst2 = (int*)uChromaDst2;
1293 int* vDst2 = (int*)vChromaDst2;
1294 const int* src = (const int*)&vd;
1295 _mm_stream_si32(uDst2, src[0]);
1296 _mm_stream_si32(vDst2, src[1]);
1297 uChromaDst2 += 4;
1298 vChromaDst2 += 4;
1299 }
1300
1301 if (yLumaDstOdd)
1302 {
1303 _mm_storel_epi64((__m128i*)vLumaDst, vavg);
1304 vLumaDst += 8;
1305 }
1306 else
1307 {
1308 const __m128i mask =
1309 _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
1310 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
1311 __m128i vd = _mm_shuffle_epi8(ve, mask);
1312 _mm_storel_epi64((__m128i*)vLumaDst, vd);
1313 vLumaDst += 8;
1314 }
1315 }
1316 }
1317
1318 general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(x, srcEven, srcOdd, yLumaDstEven, yLumaDstOdd,
1319 uLumaDst, vLumaDst, yEvenChromaDst1, yEvenChromaDst2,
1320 yOddChromaDst1, yOddChromaDst2, uChromaDst1,
1321 uChromaDst2, vChromaDst1, vChromaDst2, width);
1322}
1323
1324static pstatus_t sse41_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc,
1325 WINPR_ATTR_UNUSED UINT32 srcFormat, UINT32 srcStep,
1326 BYTE* WINPR_RESTRICT pDst1[], const UINT32 dst1Step[],
1327 BYTE* WINPR_RESTRICT pDst2[], const UINT32 dst2Step[],
1328 const prim_size_t* WINPR_RESTRICT roi)
1329{
1330 if (roi->height < 1 || roi->width < 1)
1331 return !PRIMITIVES_SUCCESS;
1332
1333 size_t y = 0;
1334 for (; y < roi->height - roi->height % 2; y += 2)
1335 {
1336 const BYTE* srcEven = (pSrc + y * srcStep);
1337 const BYTE* srcOdd = (srcEven + srcStep);
1338 BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1339 BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]);
1340 BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1341 BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1342 BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1343 BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1344 BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
1345 BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
1346 BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1347 BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1348 BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1349 BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1350 sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
1351 dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
1352 dstOddChromaY1, dstOddChromaY2, dstChromaU1,
1353 dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
1354 }
1355
1356 for (; y < roi->height; y++)
1357 {
1358 const BYTE* srcEven = (pSrc + y * srcStep);
1359 BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
1360 BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
1361 BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
1362 BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
1363 BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
1364 BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
1365 BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
1366 BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
1367 BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
1368 general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(0, srcEven, NULL, dstLumaYEven, NULL, dstLumaU,
1369 dstLumaV, dstEvenChromaY1, dstEvenChromaY2, NULL,
1370 NULL, dstChromaU1, dstChromaU2, dstChromaV1,
1371 dstChromaV2, roi->width);
1372 }
1373
1374 return PRIMITIVES_SUCCESS;
1375}
1376
1377static pstatus_t sse41_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
1378 UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
1379 const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
1380 const UINT32 dst2Step[],
1381 const prim_size_t* WINPR_RESTRICT roi)
1382{
1383 switch (srcFormat)
1384 {
1385 case PIXEL_FORMAT_BGRX32:
1386 case PIXEL_FORMAT_BGRA32:
1387 return sse41_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1388 dst2Step, roi);
1389
1390 default:
1391 return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
1392 dst2Step, roi);
1393 }
1394}
1395
1396static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const UINT32 srcStep[],
1397 BYTE* WINPR_RESTRICT pDstRaw[], const UINT32 dstStep[],
1398 const RECTANGLE_16* WINPR_RESTRICT roi)
1399{
1400 const UINT32 nWidth = roi->right - roi->left;
1401 const UINT32 nHeight = roi->bottom - roi->top;
1402 const UINT32 halfWidth = (nWidth + 1) / 2;
1403 const UINT32 halfPad = halfWidth % 16;
1404 const UINT32 halfHeight = (nHeight + 1) / 2;
1405 const UINT32 oddY = 1;
1406 const UINT32 evenY = 0;
1407 const UINT32 oddX = 1;
1408 const UINT32 evenX = 0;
1409 const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1410 pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1411 pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1412 BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1413 pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1414 pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1415
1416 /* Y data is already here... */
1417 /* B1 */
1418 for (size_t y = 0; y < nHeight; y++)
1419 {
1420 const BYTE* Ym = pSrc[0] + y * srcStep[0];
1421 BYTE* pY = pDst[0] + y * dstStep[0];
1422 memcpy(pY, Ym, nWidth);
1423 }
1424
1425 /* The first half of U and V is already part of this frame. */
1426 /* B2 and B3 */
1427 for (size_t y = 0; y < halfHeight; y++)
1428 {
1429 const size_t val2y = (2 * y + evenY);
1430 const size_t val2y1 = val2y + oddY;
1431 const BYTE* Um = pSrc[1] + 1ULL * srcStep[1] * y;
1432 const BYTE* Vm = pSrc[2] + 1ULL * srcStep[2] * y;
1433 BYTE* pU = pDst[1] + 1ULL * dstStep[1] * val2y;
1434 BYTE* pV = pDst[2] + 1ULL * dstStep[2] * val2y;
1435 BYTE* pU1 = pDst[1] + 1ULL * dstStep[1] * val2y1;
1436 BYTE* pV1 = pDst[2] + 1ULL * dstStep[2] * val2y1;
1437
1438 size_t x = 0;
1439 for (; x < halfWidth - halfPad; x += 16)
1440 {
1441 const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1442 const __m128i unpackLow =
1443 _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
1444 {
1445 const __m128i u = LOAD_SI128(&Um[x]);
1446 const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1447 const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1448 STORE_SI128(&pU[2ULL * x], uHigh);
1449 STORE_SI128(&pU[2ULL * x + 16], uLow);
1450 STORE_SI128(&pU1[2ULL * x], uHigh);
1451 STORE_SI128(&pU1[2ULL * x + 16], uLow);
1452 }
1453 {
1454 const __m128i u = LOAD_SI128(&Vm[x]);
1455 const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
1456 const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
1457 STORE_SI128(&pV[2 * x], uHigh);
1458 STORE_SI128(&pV[2 * x + 16], uLow);
1459 STORE_SI128(&pV1[2 * x], uHigh);
1460 STORE_SI128(&pV1[2 * x + 16], uLow);
1461 }
1462 }
1463
1464 for (; x < halfWidth; x++)
1465 {
1466 const size_t val2x = 2 * x + evenX;
1467 const size_t val2x1 = val2x + oddX;
1468 pU[val2x] = Um[x];
1469 pV[val2x] = Vm[x];
1470 pU[val2x1] = Um[x];
1471 pV[val2x1] = Vm[x];
1472 pU1[val2x] = Um[x];
1473 pV1[val2x] = Vm[x];
1474 pU1[val2x1] = Um[x];
1475 pV1[val2x1] = Vm[x];
1476 }
1477 }
1478
1479 return PRIMITIVES_SUCCESS;
1480}
1481
1482static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
1483 const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
1484 const UINT32 dstStep[3],
1485 const RECTANGLE_16* WINPR_RESTRICT roi)
1486{
1487 const UINT32 mod = 16;
1488 UINT32 uY = 0;
1489 UINT32 vY = 0;
1490 const UINT32 nWidth = roi->right - roi->left;
1491 const UINT32 nHeight = roi->bottom - roi->top;
1492 const UINT32 halfWidth = (nWidth + 1) / 2;
1493 const UINT32 halfPad = halfWidth % 16;
1494 const UINT32 halfHeight = (nHeight + 1) / 2;
1495 const UINT32 oddY = 1;
1496 const UINT32 evenY = 0;
1497 const UINT32 oddX = 1;
1498 /* The auxiliary frame is aligned to multiples of 16x16.
1499 * We need the padded height for B4 and B5 conversion. */
1500 const UINT32 padHeight = nHeight + 16 - nHeight % 16;
1501 const BYTE* pSrc[3] = { pSrcRaw[0] + 1ULL * roi->top * srcStep[0] + roi->left,
1502 pSrcRaw[1] + 1ULL * roi->top / 2 * srcStep[1] + roi->left / 2,
1503 pSrcRaw[2] + 1ULL * roi->top / 2 * srcStep[2] + roi->left / 2 };
1504 BYTE* pDst[3] = { pDstRaw[0] + 1ULL * roi->top * dstStep[0] + roi->left,
1505 pDstRaw[1] + 1ULL * roi->top * dstStep[1] + roi->left,
1506 pDstRaw[2] + 1ULL * roi->top * dstStep[2] + roi->left };
1507 const __m128i zero = _mm_setzero_si128();
1508 const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
1509 (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
1510
1511 /* The second half of U and V is a bit more tricky... */
1512 /* B4 and B5 */
1513 for (size_t y = 0; y < padHeight; y++)
1514 {
1515 const BYTE* Ya = pSrc[0] + 1ULL * srcStep[0] * y;
1516 BYTE* pX = NULL;
1517
1518 if ((y) % mod < (mod + 1) / 2)
1519 {
1520 const UINT32 pos = (2 * uY++ + oddY);
1521
1522 if (pos >= nHeight)
1523 continue;
1524
1525 pX = pDst[1] + 1ULL * dstStep[1] * pos;
1526 }
1527 else
1528 {
1529 const UINT32 pos = (2 * vY++ + oddY);
1530
1531 if (pos >= nHeight)
1532 continue;
1533
1534 pX = pDst[2] + 1ULL * dstStep[2] * pos;
1535 }
1536
1537 memcpy(pX, Ya, nWidth);
1538 }
1539
1540 /* B6 and B7 */
1541 for (size_t y = 0; y < halfHeight; y++)
1542 {
1543 const size_t val2y = (y * 2 + evenY);
1544 const BYTE* Ua = pSrc[1] + srcStep[1] * y;
1545 const BYTE* Va = pSrc[2] + srcStep[2] * y;
1546 BYTE* pU = pDst[1] + dstStep[1] * val2y;
1547 BYTE* pV = pDst[2] + dstStep[2] * val2y;
1548
1549 size_t x = 0;
1550 for (; x < halfWidth - halfPad; x += 16)
1551 {
1552 {
1553 const __m128i u = LOAD_SI128(&Ua[x]);
1554 const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1555 const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1556 _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
1557 _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
1558 }
1559 {
1560 const __m128i u = LOAD_SI128(&Va[x]);
1561 const __m128i u2 = _mm_unpackhi_epi8(u, zero);
1562 const __m128i u1 = _mm_unpacklo_epi8(u, zero);
1563 _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
1564 _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
1565 }
1566 }
1567
1568 for (; x < halfWidth; x++)
1569 {
1570 const size_t val2x1 = (x * 2ULL + oddX);
1571 pU[val2x1] = Ua[x];
1572 pV[val2x1] = Va[x];
1573 }
1574 }
1575
1576 return PRIMITIVES_SUCCESS;
1577}
1578
1579static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
1580 UINT32 nTotalWidth, WINPR_ATTR_UNUSED UINT32 nTotalHeight,
1581 BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
1582 const RECTANGLE_16* WINPR_RESTRICT roi)
1583{
1584 const UINT32 nWidth = roi->right - roi->left;
1585 const UINT32 nHeight = roi->bottom - roi->top;
1586 const UINT32 halfWidth = (nWidth + 1) / 2;
1587 const UINT32 halfPad = halfWidth % 16;
1588 const UINT32 halfHeight = (nHeight + 1) / 2;
1589 const UINT32 quarterWidth = (nWidth + 3) / 4;
1590 const UINT32 quarterPad = quarterWidth % 16;
1591 const __m128i zero = _mm_setzero_si128();
1592 const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
1593 (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
1594 const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
1595 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
1596 const __m128i shuffle1 =
1597 _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
1598 (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
1599 const __m128i shuffle2 =
1600 _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
1601 (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);
1602
1603 /* B4 and B5: odd UV values for width/2, height */
1604 for (size_t y = 0; y < nHeight; y++)
1605 {
1606 const size_t yTop = y + roi->top;
1607 const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
1608 const BYTE* pYaV = pYaU + nTotalWidth / 2;
1609 BYTE* pU = pDst[1] + 1ULL * dstStep[1] * yTop + roi->left;
1610 BYTE* pV = pDst[2] + 1ULL * dstStep[2] * yTop + roi->left;
1611
1612 size_t x = 0;
1613 for (; x < halfWidth - halfPad; x += 16)
1614 {
1615 {
1616 const __m128i u = LOAD_SI128(&pYaU[x]);
1617 const __m128i u2 = _mm_unpackhi_epi8(zero, u);
1618 const __m128i u1 = _mm_unpacklo_epi8(zero, u);
1619 _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
1620 _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
1621 }
1622 {
1623 const __m128i v = LOAD_SI128(&pYaV[x]);
1624 const __m128i v2 = _mm_unpackhi_epi8(zero, v);
1625 const __m128i v1 = _mm_unpacklo_epi8(zero, v);
1626 _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
1627 _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
1628 }
1629 }
1630
1631 for (; x < halfWidth; x++)
1632 {
1633 const size_t odd = 2ULL * x + 1;
1634 pU[odd] = pYaU[x];
1635 pV[odd] = pYaV[x];
1636 }
1637 }
1638
1639 /* B6 - B9 */
1640 for (size_t y = 0; y < halfHeight; y++)
1641 {
1642 const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
1643 const BYTE* pUaV = pUaU + nTotalWidth / 4;
1644 const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
1645 const BYTE* pVaV = pVaU + nTotalWidth / 4;
1646 BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
1647 BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
1648
1649 UINT32 x = 0;
1650 for (; x < quarterWidth - quarterPad; x += 16)
1651 {
1652 {
1653 const __m128i uU = LOAD_SI128(&pUaU[x]);
1654 const __m128i uV = LOAD_SI128(&pVaU[x]);
1655 const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
1656 const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
1657 const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
1658 const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
1659 const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
1660 const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
1661 _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
1662 _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
1663 _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
1664 _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
1665 }
1666 {
1667 const __m128i vU = LOAD_SI128(&pUaV[x]);
1668 const __m128i vV = LOAD_SI128(&pVaV[x]);
1669 const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
1670 const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
1671 const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
1672 const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
1673 const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
1674 const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
1675 _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
1676 _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
1677 _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
1678 _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
1679 }
1680 }
1681
1682 for (; x < quarterWidth; x++)
1683 {
1684 pU[4 * x + 0] = pUaU[x];
1685 pV[4 * x + 0] = pUaV[x];
1686 pU[4 * x + 2] = pVaU[x];
1687 pV[4 * x + 2] = pVaV[x];
1688 }
1689 }
1690
1691 return PRIMITIVES_SUCCESS;
1692}
1693
1694static pstatus_t sse41_YUV420CombineToYUV444(avc444_frame_type type,
1695 const BYTE* WINPR_RESTRICT pSrc[3],
1696 const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
1697 BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
1698 const RECTANGLE_16* WINPR_RESTRICT roi)
1699{
1700 if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
1701 return -1;
1702
1703 if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
1704 return -1;
1705
1706 if (!roi)
1707 return -1;
1708
1709 switch (type)
1710 {
1711 case AVC444_LUMA:
1712 return sse41_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1713
1714 case AVC444_CHROMAv1:
1715 return sse41_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
1716
1717 case AVC444_CHROMAv2:
1718 return sse41_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
1719
1720 default:
1721 return -1;
1722 }
1723}
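/* Hypothetical usage sketch (names illustrative): an AVC444 frame is rebuilt by
 * running the luma pass for the main view and then the matching chroma pass for
 * the auxiliary view onto the same destination YUV444 planes:
 *
 *   primitives_t* prims = primitives_get();
 *   RECTANGLE_16 rect = { 0, 0, (UINT16)width, (UINT16)height };
 *   prims->YUV420CombineToYUV444(AVC444_LUMA, mainPlanes, mainSteps,
 *                                width, height, dstPlanes, dstSteps, &rect);
 *   prims->YUV420CombineToYUV444(AVC444_CHROMAv1, auxPlanes, auxSteps,
 *                                width, height, dstPlanes, dstSteps, &rect);
 */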
1724#endif
1725
1726void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims)
1727{
1728#if defined(SSE_AVX_INTRINSICS_ENABLED)
1729 generic = primitives_get_generic();
1730
1731 WLog_VRB(PRIM_TAG, "sse41 optimizations");
1732 prims->RGBToYUV420_8u_P3AC4R = sse41_RGBToYUV420;
1733 prims->RGBToAVC444YUV = sse41_RGBToAVC444YUV;
1734 prims->RGBToAVC444YUVv2 = sse41_RGBToAVC444YUVv2;
1735 prims->YUV420ToRGB_8u_P3AC4R = sse41_YUV420ToRGB;
1736 prims->YUV444ToRGB_8u_P3AC4R = sse41_YUV444ToRGB_8u_P3AC4R;
1737 prims->YUV420CombineToYUV444 = sse41_YUV420CombineToYUV444;
1738#else
1739 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or sse41 intrinsics not available");
1740 WINPR_UNUSED(prims);
1741#endif
1742}