FreeRDP
prim_colors_sse2.c
1/* FreeRDP: A Remote Desktop Protocol Client
2 * Optimized Color conversion operations.
3 * vi:ts=4 sw=4:
4 *
5 * Copyright 2011 Stephen Erisman
6 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
7 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
8 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
9 *
10 * Licensed under the Apache License, Version 2.0 (the "License"); you may
11 * not use this file except in compliance with the License. You may obtain
12 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16 * or implied. See the License for the specific language governing
17 * permissions and limitations under the License.
18 */
19
20#include <freerdp/config.h>
21
22#include <freerdp/types.h>
23#include <freerdp/primitives.h>
24#include <winpr/sysinfo.h>
25
26#include "prim_colors.h"
27
28#include "prim_internal.h"
29#include "prim_templates.h"
30
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
32#include <emmintrin.h>
33
34static primitives_t* generic = NULL;
35
36#define CACHE_LINE_BYTES 64
37
38/* 1.403 << 14 */
39/* -0.344 << 14 */
40/* -0.714 << 14 */
41/* 1.770 << 14 */
42
43static const int32_t ycbcr_table[][4] = { { 1, 0, -1, 2 },
44 { 3, -1, -1, 4 },
45 { 6, -1, -3, 7 },
46 { 11, -3, -6, 14 },
47 { 22, -6, -11, 28 },
48 { 45, -11, -23, 57 },
49 { 90, -22, -46, 113 },
50 { 180, -44, -91, 227 },
51 { 359, -88, -183, 453 },
52 { 718, -176, -366, 906 },
53 { 1437, -352, -731, 1812 },
54 { 2873, -705, -1462, 3625 },
55 { 5747, -1409, -2925, 7250 },
56 { 11493, -2818, -5849, 14500 },
57 { 22987, -5636, -11698, 29000 },
58 { 45974, -11272, -23396, 57999 },
59 { 91947, -22544, -46793, 115999 },
60 { 183894, -45089, -93585, 231997 },
61 { 367788, -90178, -187171, 463995 },
62 { 735576, -180355, -374342, 927990 },
63 { 1471152, -360710, -748683, 1855980 },
64 { 2942304, -721420, -1497367, 3711959 },
65 { 5884609, -1442841, -2994733, 7423918 },
66 { 11769217, -2885681, -5989466, 14847836 },
67 { 23538434, -5771362, -11978932, 29695672 },
68 { 47076868, -11542725, -23957864, 59391345 },
69 { 94153736, -23085449, -47915729, 118782689 },
70 { 188307472, -46170898, -95831458, 237565379 },
71 { 376614945, -92341797, -191662916, 475130757 },
72 { 753229890, -184683594, -383325831, 950261514 },
73 { 1506459779, -369367187, -766651662, 1900523028 } };
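/* Each row n of ycbcr_table holds the four factors listed above scaled by 2^n
 * and rounded to the nearest integer, i.e. round(f * (1 << n)). A quick check
 * of the row used by the SSE2 paths below (n = 14):
 *
 *   round( 1.403 * 16384) =  22987  == ycbcr_table[14][0]
 *   round(-0.344 * 16384) =  -5636  == ycbcr_table[14][1]
 *   round(-0.714 * 16384) = -11698  == ycbcr_table[14][2]
 *   round( 1.770 * 16384) =  29000  == ycbcr_table[14][3]
 *
 * The scalar tail loops below index the table with n = 16 instead.
 */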
74
75static inline __m128i mm_between_epi16_int(__m128i val, __m128i min, __m128i max)
76{
77 return _mm_min_epi16(max, _mm_max_epi16(val, min));
78}
79
80#define mm_between_epi16(_val, _min, _max) (_val) = mm_between_epi16_int((_val), (_min), (_max))
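/* Usage sketch (v is a hypothetical value): the macro rewrites its first
 * argument in place, clamping every signed 16-bit lane into [_min, _max]:
 *
 *   __m128i v = _mm_set1_epi16(300);
 *   mm_between_epi16(v, _mm_setzero_si128(), _mm_set1_epi16(255));
 *   // every lane of v is now 255
 */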
81
82static inline void mm_prefetch_buffer(const void* WINPR_RESTRICT buffer, size_t width,
83 size_t stride, size_t height)
84{
85 const size_t srcbump = stride / sizeof(__m128i);
86 const __m128i* buf = (const __m128i*)buffer;
87
88 for (size_t y = 0; y < height; y++)
89 {
90 const __m128i* line = &buf[y * srcbump];
91 for (size_t x = 0; x < width * sizeof(INT16) / sizeof(__m128i);
92 x += (CACHE_LINE_BYTES / sizeof(__m128i)))
93 {
94 const char* ptr = (const char*)&line[x];
95 _mm_prefetch(ptr, _MM_HINT_NTA);
96 }
97 }
98}
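/* Worked example (illustrative numbers only): for a plane 64 INT16 samples
 * wide with a 128-byte stride, one row spans 64 * 2 = 128 bytes = 8 __m128i
 * = 2 cache lines, so the loop above issues two _mm_prefetch() calls per row,
 * stepping x by CACHE_LINE_BYTES / sizeof(__m128i) = 4 entries at a time.
 */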
99
100/*---------------------------------------------------------------------------*/
101static pstatus_t
102sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
103 WINPR_ATTR_UNUSED UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
104 UINT32 dstStep,
105 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
106{
107 const __m128i zero = _mm_setzero_si128();
108 const __m128i max = _mm_set1_epi16(255);
109 const __m128i r_cr =
110 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
111 const __m128i g_cb =
112 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
113 const __m128i g_cr =
114 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
115 const __m128i b_cb =
116 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
117 const __m128i c4096 = _mm_set1_epi16(4096);
118 const INT16* y_buf = pSrc[0];
119 const INT16* cb_buf = pSrc[1];
120 const INT16* cr_buf = pSrc[2];
121 const UINT32 pad = roi->width % 16;
122 const UINT32 step = sizeof(__m128i) / sizeof(INT16);
123 const size_t imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
124 BYTE* d_buf = pDst;
125 const size_t dstPad = (dstStep - roi->width * 4);
126
127 mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height);
128 mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height);
129 mm_prefetch_buffer(cb_buf, roi->width, (size_t)srcStep, roi->height);
130
131 for (UINT32 yp = 0; yp < roi->height; ++yp)
132 {
133 for (size_t i = 0; i < imax; i += 2)
134 {
135 /* In order to use SSE2 signed 16-bit integer multiplication
136 * we need to convert the floating point factors to signed int
137 * without losing information.
138 * The result of this multiplication is 32 bit and we have two
139 * SSE instructions that return either the hi or lo word.
140 * Thus we will multiply the factors by the highest possible 2^n,
141 * take the upper 16 bits of the signed 32-bit result
142 * (_mm_mulhi_epi16) and correct this result by multiplying
143 * it by 2^(16-n).
144 *
145 * For the given factors in the conversion matrix the best
146 * possible n is 14.
147 *
148 * Example for calculating r:
149 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
150 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
151 * r = (y+4096)>>5 + (HIWORD(cr*22987)<<2)>>5 // simplification
152 * r = ((y+4096)>>2 + HIWORD(cr*22987)) >> 3
153 */
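/* Scalar sketch of the vector code below for one sample (n = 14 factors from
 * ycbcr_table; _mm_mulhi_epi16(a, b) yields the high 16 bits, i.e. ">> 16",
 * of the signed 32-bit product):
 *
 *   y = (Y + 4096) >> 2;
 *   r = CLIP((y + ((Cr *  22987) >> 16)) >> 3);
 *   g = CLIP((y + ((Cb *  -5636) >> 16) + ((Cr * -11698) >> 16)) >> 3);
 *   b = CLIP((y + ((Cb *  29000) >> 16)) >> 3);
 */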
154 /* y = (y_r_buf[i] + 4096) >> 2 */
155 __m128i y1 = LOAD_SI128(y_buf);
156 y_buf += step;
157 y1 = _mm_add_epi16(y1, c4096);
158 y1 = _mm_srai_epi16(y1, 2);
159 /* cb = cb_g_buf[i]; */
160 __m128i cb1 = LOAD_SI128(cb_buf);
161 cb_buf += step;
162 /* cr = cr_b_buf[i]; */
163 __m128i cr1 = LOAD_SI128(cr_buf);
164 cr_buf += step;
165 /* (y + HIWORD(cr*22987)) >> 3 */
166 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
167 r1 = _mm_srai_epi16(r1, 3);
168 /* r_buf[i] = CLIP(r); */
169 mm_between_epi16(r1, zero, max);
170 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
171 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
172 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
173 g1 = _mm_srai_epi16(g1, 3);
174 /* g_buf[i] = CLIP(g); */
175 mm_between_epi16(g1, zero, max);
176 /* (y + HIWORD(cb*29000)) >> 3 */
177 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
178 b1 = _mm_srai_epi16(b1, 3);
179 /* b_buf[i] = CLIP(b); */
180 mm_between_epi16(b1, zero, max);
181 __m128i y2 = LOAD_SI128(y_buf);
182 y_buf += step;
183 y2 = _mm_add_epi16(y2, c4096);
184 y2 = _mm_srai_epi16(y2, 2);
185 /* cb = cb_g_buf[i]; */
186 __m128i cb2 = LOAD_SI128(cb_buf);
187 cb_buf += step;
188 /* cr = cr_b_buf[i]; */
189 __m128i cr2 = LOAD_SI128(cr_buf);
190 cr_buf += step;
191 /* (y + HIWORD(cr*22987)) >> 3 */
192 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
193 r2 = _mm_srai_epi16(r2, 3);
194 /* r_buf[i] = CLIP(r); */
195 mm_between_epi16(r2, zero, max);
196 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
197 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
198 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
199 g2 = _mm_srai_epi16(g2, 3);
200 /* g_buf[i] = CLIP(g); */
201 mm_between_epi16(g2, zero, max);
202 /* (y + HIWORD(cb*29000)) >> 3 */
203 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
204 b2 = _mm_srai_epi16(b2, 3);
205 /* b_buf[i] = CLIP(b); */
206 mm_between_epi16(b2, zero, max);
207 {
208 /* The comments below pretend these are 8-byte registers
209 * rather than 16-byte, for readability.
210 */
211 __m128i R0 = b1; /* R0 = 00B300B200B100B0 */
212 __m128i R1 = b2; /* R1 = 00B700B600B500B4 */
213 R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */
214 R1 = g1; /* R1 = 00G300G200G100G0 */
215 __m128i R2 = g2; /* R2 = 00G700G600G500G4 */
216 R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
217 R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
218 R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */
219 R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */
220 R0 = r1; /* R0 = 00R300R200R100R0 */
221 __m128i R3 = r2; /* R3 = 00R700R600R500R4 */
222 R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */
223 R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
224 __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
225 R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */
226 R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */
227 R0 = R4; /* R0 = R4 */
228 R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */
229 R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */
230 R2 = R3; /* R2 = R3 */
231 R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
232 R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
233 STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */
234 d_buf += sizeof(__m128i);
235 STORE_SI128(d_buf, R4); /* B3G3R3FFB2G2R2FF */
236 d_buf += sizeof(__m128i);
237 STORE_SI128(d_buf, R2); /* B5G5R5FFB4G4R4FF */
238 d_buf += sizeof(__m128i);
239 STORE_SI128(d_buf, R3); /* B7G7R7FFB6G6R6FF */
240 d_buf += sizeof(__m128i);
241 }
242 }
243
244 for (UINT32 i = 0; i < pad; i++)
245 {
246 const INT32 divisor = 16;
247 const INT32 Y = ((*y_buf++) + 4096) << divisor;
248 const INT32 Cb = (*cb_buf++);
249 const INT32 Cr = (*cr_buf++);
250 const INT32 CrR = Cr * ycbcr_table[divisor][0];
251 const INT32 CrG = Cr * ycbcr_table[divisor][1];
252 const INT32 CbG = Cb * ycbcr_table[divisor][2];
253 const INT32 CbB = Cb * ycbcr_table[divisor][3];
254 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
255 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
256 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
257 *d_buf++ = CLIP(B);
258 *d_buf++ = CLIP(G);
259 *d_buf++ = CLIP(R);
260 *d_buf++ = 0xFF;
261 }
262
263 d_buf += dstPad;
264 }
265
266 return PRIMITIVES_SUCCESS;
267}
268
269/*---------------------------------------------------------------------------*/
270static pstatus_t
271sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3],
272 WINPR_ATTR_UNUSED UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
273 UINT32 dstStep,
274 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
275{
276 const __m128i zero = _mm_setzero_si128();
277 const __m128i max = _mm_set1_epi16(255);
278 const __m128i r_cr =
279 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
280 const __m128i g_cb =
281 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
282 const __m128i g_cr =
283 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
284 const __m128i b_cb =
285 _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
286 const __m128i c4096 = _mm_set1_epi16(4096);
287 const INT16* y_buf = pSrc[0];
288 const INT16* cb_buf = pSrc[1];
289 const INT16* cr_buf = pSrc[2];
290 const UINT32 pad = roi->width % 16;
291 const UINT32 step = sizeof(__m128i) / sizeof(INT16);
292 const size_t imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
293 BYTE* d_buf = pDst;
294 const size_t dstPad = (dstStep - roi->width * 4);
295
296 mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height);
297 mm_prefetch_buffer(cb_buf, roi->width, (size_t)srcStep, roi->height);
298 mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height);
299
300 for (UINT32 yp = 0; yp < roi->height; ++yp)
301 {
302 for (size_t i = 0; i < imax; i += 2)
303 {
304 /* In order to use SSE2 signed 16-bit integer multiplication
305 * we need to convert the floating point factors to signed int
306 * without losing information.
307 * The result of this multiplication is 32 bit and we have two
308 * SSE instructions that return either the hi or lo word.
309 * Thus we will multiply the factors by the highest possible 2^n,
310 * take the upper 16 bits of the signed 32-bit result
311 * (_mm_mulhi_epi16) and correct this result by multiplying
312 * it by 2^(16-n).
313 *
314 * For the given factors in the conversion matrix the best
315 * possible n is 14.
316 *
317 * Example for calculating r:
318 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
319 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
320 * r = (y+4096)>>5 + (HIWORD(cr*22987)<<2)>>5 // simplification
321 * r = ((y+4096)>>2 + HIWORD(cr*22987)) >> 3
322 */
323 /* y = (y_r_buf[i] + 4096) >> 2 */
324 __m128i y1 = LOAD_SI128(y_buf);
325 y_buf += step;
326 y1 = _mm_add_epi16(y1, c4096);
327 y1 = _mm_srai_epi16(y1, 2);
328 /* cb = cb_g_buf[i]; */
329 __m128i cb1 = LOAD_SI128(cb_buf);
330 cb_buf += step;
331 /* cr = cr_b_buf[i]; */
332 __m128i cr1 = LOAD_SI128(cr_buf);
333 cr_buf += step;
334 /* (y + HIWORD(cr*22987)) >> 3 */
335 __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
336 r1 = _mm_srai_epi16(r1, 3);
337 /* r_buf[i] = CLIP(r); */
338 mm_between_epi16(r1, zero, max);
339 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
340 __m128i g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
341 g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
342 g1 = _mm_srai_epi16(g1, 3);
343 /* g_buf[i] = CLIP(g); */
344 mm_between_epi16(g1, zero, max);
345 /* (y + HIWORD(cb*29000)) >> 3 */
346 __m128i b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
347 b1 = _mm_srai_epi16(b1, 3);
348 /* b_buf[i] = CLIP(b); */
349 mm_between_epi16(b1, zero, max);
350 __m128i y2 = LOAD_SI128(y_buf);
351 y_buf += step;
352 y2 = _mm_add_epi16(y2, c4096);
353 y2 = _mm_srai_epi16(y2, 2);
354 /* cb = cb_g_buf[i]; */
355 __m128i cb2 = LOAD_SI128(cb_buf);
356 cb_buf += step;
357 /* cr = cr_b_buf[i]; */
358 __m128i cr2 = LOAD_SI128(cr_buf);
359 cr_buf += step;
360 /* (y + HIWORD(cr*22987)) >> 3 */
361 __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
362 r2 = _mm_srai_epi16(r2, 3);
363 /* r_buf[i] = CLIP(r); */
364 mm_between_epi16(r2, zero, max);
365 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
366 __m128i g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
367 g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
368 g2 = _mm_srai_epi16(g2, 3);
369 /* g_buf[i] = CLIP(g); */
370 mm_between_epi16(g2, zero, max);
371 /* (y + HIWORD(cb*29000)) >> 3 */
372 __m128i b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
373 b2 = _mm_srai_epi16(b2, 3);
374 /* b_buf[i] = CLIP(b); */
375 mm_between_epi16(b2, zero, max);
376 {
377 /* The comments below pretend these are 8-byte registers
378 * rather than 16-byte, for readability.
379 */
380 __m128i R0 = r1; /* R0 = 00R300R200R100R0 */
381 __m128i R1 = r2; /* R1 = 00R700R600R500R4 */
382 R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */
383 R1 = g1; /* R1 = 00G300G200G100G0 */
384 __m128i R2 = g2; /* R2 = 00G700G600G500G4 */
385 R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
386 R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
387 R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */
388 R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */
389 R0 = b1; /* R0 = 00B300B200B100B0 */
390 __m128i R3 = b2; /* R3 = 00B700B600B500B4 */
391 R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */
392 R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
393 __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
394 R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */
395 R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */
396 R0 = R4; /* R0 = R4 */
397 R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */
398 R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */
399 R2 = R3; /* R2 = R3 */
400 R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
401 R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
402 STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */
403 d_buf += sizeof(__m128i);
404 STORE_SI128(d_buf, R4); /* R3G3B3FFR2G2B2FF */
405 d_buf += sizeof(__m128i);
406 STORE_SI128(d_buf, R2); /* R5G5B5FFR4G4B4FF */
407 d_buf += sizeof(__m128i);
408 STORE_SI128(d_buf, R3); /* R7G7B7FFR6G6B6FF */
409 d_buf += sizeof(__m128i);
410 }
411 }
412
413 for (UINT32 i = 0; i < pad; i++)
414 {
415 const INT32 divisor = 16;
416 const INT32 Y = ((*y_buf++) + 4096) << divisor;
417 const INT32 Cb = (*cb_buf++);
418 const INT32 Cr = (*cr_buf++);
419 const INT32 CrR = Cr * ycbcr_table[divisor][0];
420 const INT32 CrG = Cr * ycbcr_table[divisor][1];
421 const INT32 CbG = Cb * ycbcr_table[divisor][2];
422 const INT32 CbB = Cb * ycbcr_table[divisor][3];
423 const INT16 R = WINPR_ASSERTING_INT_CAST(int16_t, (((CrR + Y) >> divisor) >> 5));
424 const INT16 G = WINPR_ASSERTING_INT_CAST(int16_t, (((Y - CbG - CrG) >> divisor) >> 5));
425 const INT16 B = WINPR_ASSERTING_INT_CAST(int16_t, (((CbB + Y) >> divisor) >> 5));
426 *d_buf++ = CLIP(R);
427 *d_buf++ = CLIP(G);
428 *d_buf++ = CLIP(B);
429 *d_buf++ = 0xFF;
430 }
431
432 d_buf += dstPad;
433 }
434
435 return PRIMITIVES_SUCCESS;
436}
437
438static pstatus_t
439sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
440 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
441 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
442{
443 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
444 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
445 (dstStep & 0x0f))
446 {
447 /* We can't maintain 16-byte alignment. */
448 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
449 }
450
451 switch (DstFormat)
452 {
453 case PIXEL_FORMAT_BGRA32:
454 case PIXEL_FORMAT_BGRX32:
455 return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
456
457 case PIXEL_FORMAT_RGBA32:
458 case PIXEL_FORMAT_RGBX32:
459 return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
460
461 default:
462 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
463 }
464}
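/* Caller sketch (buffer names and sizes are illustrative, not taken from this
 * file): the SSE2 path above is only used when all three source planes, the
 * destination pointer and both strides are 16-byte aligned; anything else
 * falls back to the generic primitive.
 *
 *   primitives_t* prims = primitives_get();
 *   const prim_size_t roi = { .width = 64, .height = 64 };
 *   // planes[0..2]: 16-byte aligned Y/Cb/Cr INT16 data, 128 bytes per row
 *   // dst: 16-byte aligned output buffer, 256 bytes per row
 *   prims->yCbCrToRGB_16s8u_P3AC4R(planes, 128, dst, 256, PIXEL_FORMAT_BGRX32, &roi);
 */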
465/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
466 * numbers. See the general code above.
467 */
468static pstatus_t
469sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
470 INT16* WINPR_RESTRICT pDst[3], int dstStep,
471 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
472{
473 const __m128i* r_buf = (const __m128i*)(pSrc[0]);
474 const __m128i* g_buf = (const __m128i*)(pSrc[1]);
475 const __m128i* b_buf = (const __m128i*)(pSrc[2]);
476 __m128i* y_buf = (__m128i*)(pDst[0]);
477 __m128i* cb_buf = (__m128i*)(pDst[1]);
478 __m128i* cr_buf = (__m128i*)(pDst[2]);
479
480 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
481 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
482 ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
483 (srcStep & 127) || (dstStep & 127))
484 {
485 /* We can't maintain 16-byte alignment. */
486 return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
487 }
488
489 const __m128i min = _mm_set1_epi16(-128 * 32);
490 const __m128i max = _mm_set1_epi16(127 * 32);
491
492 __m128i y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
493 __m128i y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
494 __m128i y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
495 __m128i cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
496 __m128i cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
497 __m128i cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
498 __m128i cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
499 __m128i cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
500 __m128i cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
501 const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i);
502 const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i);
503
504 mm_prefetch_buffer(r_buf, roi->width, (size_t)srcStep, roi->height);
505 mm_prefetch_buffer(g_buf, roi->width, (size_t)srcStep, roi->height);
506 mm_prefetch_buffer(b_buf, roi->width, (size_t)srcStep, roi->height);
507
508 const size_t imax = roi->width * sizeof(INT16) / sizeof(__m128i);
509
510 for (UINT32 yp = 0; yp < roi->height; ++yp)
511 {
512 for (size_t i = 0; i < imax; i++)
513 {
514 /* In order to use SSE2 signed 16-bit integer multiplication we
515 * need to convert the floating point factors to signed int
516 * without losing information. The result of this multiplication
517 * is 32 bit and using SSE2 we get either the product's hi or lo
518 * word. Thus we will multiply the factors by the highest
519 * possible 2^n and take the upper 16 bits of the signed 32-bit
520 * result (_mm_mulhi_epi16). Since the final result needs to
521 * be scaled by << 5 and also in in order to keep the precision
522 * within the upper 16 bits we will also have to scale the RGB
523 * values used in the multiplication by << 5+(16-n).
524 */
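/* Scalar sketch of the vector code below for one 16-bit sample (the factors
 * are f << 15 and the inputs are pre-scaled by << 6, so the high word of each
 * product carries the contribution scaled by << 5; CLAMP(v, lo, hi) clamps v
 * into [lo, hi]):
 *
 *   r <<= 6; g <<= 6; b <<= 6;
 *   y  = CLAMP(((r *  9798) >> 16) + ((g *  19235) >> 16)
 *              + ((b *  3735) >> 16) - 4096, -4096, 4064);
 *   cb = CLAMP(((r * -5535) >> 16) + ((g * -10868) >> 16)
 *              + ((b * 16403) >> 16), -4096, 4064);
 *   cr = CLAMP(((r * 16377) >> 16) + ((g * -13714) >> 16)
 *              + ((b * -2663) >> 16), -4096, 4064);
 */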
525 __m128i r = LOAD_SI128(r_buf + i);
526 __m128i g = LOAD_SI128(g_buf + i);
527 __m128i b = LOAD_SI128(b_buf + i);
528 /* r<<6; g<<6; b<<6 */
529 r = _mm_slli_epi16(r, 6);
530 g = _mm_slli_epi16(g, 6);
531 b = _mm_slli_epi16(b, 6);
532 /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
533 __m128i y = _mm_mulhi_epi16(r, y_r);
534 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
535 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
536 y = _mm_add_epi16(y, min);
537 /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
538 mm_between_epi16(y, min, max);
539 STORE_SI128(y_buf + i, y);
540 /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
541 __m128i cb = _mm_mulhi_epi16(r, cb_r);
542 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
543 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
544 /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
545 mm_between_epi16(cb, min, max);
546 STORE_SI128(cb_buf + i, cb);
547 /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
548 __m128i cr = _mm_mulhi_epi16(r, cr_r);
549 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
550 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
551 /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
552 mm_between_epi16(cr, min, max);
553 STORE_SI128(cr_buf + i, cr);
554 }
555
556 y_buf += dstbump; /* y/cb/cr are the pDst planes */
557 cb_buf += dstbump;
558 cr_buf += dstbump;
559 r_buf += srcbump; /* r/g/b are the pSrc planes */
560 g_buf += srcbump;
561 b_buf += srcbump;
562 }
563
564 return PRIMITIVES_SUCCESS;
565}
566
567/*---------------------------------------------------------------------------*/
568static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
569 const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
570 UINT32 srcStep, /* bytes between rows in source data */
571 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved BGRX/BGRA data */
572 UINT32 dstStep, /* bytes between rows in dest data */
573 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
574{
575 const UINT16* pr = (const UINT16*)(pSrc[0]);
576 const UINT16* pg = (const UINT16*)(pSrc[1]);
577 const UINT16* pb = (const UINT16*)(pSrc[2]);
578 const UINT32 pad = roi->width % 16;
579 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
580 BYTE* out = NULL;
581 UINT32 srcbump = 0;
582 UINT32 dstbump = 0;
583 out = pDst;
584 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
585 dstbump = (dstStep - (roi->width * sizeof(UINT32)));
586
587 for (UINT32 y = 0; y < roi->height; ++y)
588 {
589 for (UINT32 x = 0; x < roi->width - pad; x += 16)
590 {
591 __m128i r;
592 __m128i g;
593 __m128i b;
594 /* The comments below pretend these are 8-byte registers
595 * rather than 16-byte, for readability.
596 */
597 {
598 __m128i R0;
599 __m128i R1;
600 R0 = LOAD_SI128(pb);
601 pb += 8; /* R0 = 00B300B200B100B0 */
602 R1 = LOAD_SI128(pb);
603 pb += 8; /* R1 = 00B700B600B500B4 */
604 b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
605 }
606 {
607 __m128i R0;
608 __m128i R1;
609 R0 = LOAD_SI128(pg);
610 pg += 8; /* R1 = 00G300G200G100G0 */
611 R1 = LOAD_SI128(pg);
612 pg += 8; /* R2 = 00G700G600G500G4 */
613 g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
614 }
615 {
616 __m128i R0;
617 __m128i R1;
618 R0 = LOAD_SI128(pr);
619 pr += 8; /* R0 = 00R300R200R100R0 */
620 R1 = LOAD_SI128(pr);
621 pr += 8; /* R3 = 00R700R600R500R4 */
622 r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
623 }
624 {
625 const __m128i gbLo = _mm_unpacklo_epi8(b, g); /* gbLo = G3B3G2B2G1B1G0B0 */
626 const __m128i gbHi = _mm_unpackhi_epi8(b, g); /* gbHi = G7B7G6B6G5B5G4B4 */
627 const __m128i arLo = _mm_unpacklo_epi8(r, a); /* arLo = FFR3FFR2FFR1FFR0 */
628 const __m128i arHi = _mm_unpackhi_epi8(r, a); /* arHi = FFR7FFR6FFR5FFR4 */
629
630 {
631 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
632 STORE_SI128(out, bgrx);
633 out += 16; /* FFR1G1B1FFR0G0B0 */
634 }
635 {
636 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
637 STORE_SI128(out, bgrx);
638 out += 16; /* FFR3G3B3FFR2G2B2 */
639 }
640 {
641 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
642 STORE_SI128(out, bgrx);
643 out += 16; /* FFR5G5B5FFR4G4B4 */
644 }
645 {
646 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
647 STORE_SI128(out, bgrx);
648 out += 16; /* FFR7G7B7FFR6G6B6 */
649 }
650 }
651 }
652
653 for (UINT32 x = 0; x < pad; x++)
654 {
655 const BYTE R = CLIP(*pr++);
656 const BYTE G = CLIP(*pg++);
657 const BYTE B = CLIP(*pb++);
658 *out++ = B;
659 *out++ = G;
660 *out++ = R;
661 *out++ = 0xFF;
662 }
663
664 /* Jump to next row. */
665 pr += srcbump;
666 pg += srcbump;
667 pb += srcbump;
668 out += dstbump;
669 }
670
671 return PRIMITIVES_SUCCESS;
672}
673
674static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
675 const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
676 UINT32 srcStep, /* bytes between rows in source data */
677 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved RGBX/RGBA data */
678 UINT32 dstStep, /* bytes between rows in dest data */
679 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
680{
681 const UINT16* pr = (const UINT16*)(pSrc[0]);
682 const UINT16* pg = (const UINT16*)(pSrc[1]);
683 const UINT16* pb = (const UINT16*)(pSrc[2]);
684 const UINT32 pad = roi->width % 16;
685 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
686 BYTE* out = NULL;
687 UINT32 srcbump = 0;
688 UINT32 dstbump = 0;
689 out = pDst;
690 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
691 dstbump = (dstStep - (roi->width * sizeof(UINT32)));
692
693 for (UINT32 y = 0; y < roi->height; ++y)
694 {
695 for (UINT32 x = 0; x < roi->width - pad; x += 16)
696 {
697 __m128i r;
698 __m128i g;
699 __m128i b;
700 /* The comments below pretend these are 8-byte registers
701 * rather than 16-byte, for readability.
702 */
703 {
704 __m128i R0;
705 __m128i R1;
706 R0 = LOAD_SI128(pb);
707 pb += 8; /* R0 = 00B300B200B100B0 */
708 R1 = LOAD_SI128(pb);
709 pb += 8; /* R1 = 00B700B600B500B4 */
710 b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
711 }
712 {
713 __m128i R0;
714 __m128i R1;
715 R0 = LOAD_SI128(pg);
716 pg += 8; /* R1 = 00G300G200G100G0 */
717 R1 = LOAD_SI128(pg);
718 pg += 8; /* R2 = 00G700G600G500G4 */
719 g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
720 }
721 {
722 __m128i R0;
723 __m128i R1;
724 R0 = LOAD_SI128(pr);
725 pr += 8; /* R0 = 00R300R200R100R0 */
726 R1 = LOAD_SI128(pr);
727 pr += 8; /* R3 = 00R700R600R500R4 */
728 r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
729 }
730 {
731 __m128i gbHi;
732 __m128i gbLo;
733 __m128i arHi;
734 __m128i arLo;
735 {
736 gbLo = _mm_unpacklo_epi8(r, g); /* gbLo = G3R3G2R2G1R1G0R0 */
737 gbHi = _mm_unpackhi_epi8(r, g); /* gbHi = G7R7G6R6G5R5G4R4 */
738 arLo = _mm_unpacklo_epi8(b, a); /* arLo = FFB3FFB2FFB1FFB0 */
739 arHi = _mm_unpackhi_epi8(b, a); /* arHi = FFB7FFB6FFB5FFB4 */
740 }
741 {
742 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
743 STORE_SI128(out, bgrx);
744 out += 16; /* FFB1G1R1FFB0G0R0 */
745 }
746 {
747 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
748 STORE_SI128(out, bgrx);
749 out += 16; /* FFB3G3R3FFB2G2R2 */
750 }
751 {
752 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
753 STORE_SI128(out, bgrx);
754 out += 16; /* FFB5G5R5FFB4G4R4 */
755 }
756 {
757 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
758 STORE_SI128(out, bgrx);
759 out += 16; /* FFB7G7R7FFB6G6R6 */
760 }
761 }
762 }
763
764 for (UINT32 x = 0; x < pad; x++)
765 {
766 const BYTE R = CLIP(*pr++);
767 const BYTE G = CLIP(*pg++);
768 const BYTE B = CLIP(*pb++);
769 *out++ = R;
770 *out++ = G;
771 *out++ = B;
772 *out++ = 0xFF;
773 }
774
775 /* Jump to next row. */
776 pr += srcbump;
777 pg += srcbump;
778 pb += srcbump;
779 out += dstbump;
780 }
781
782 return PRIMITIVES_SUCCESS;
783}
784
785static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
786 const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
787 UINT32 srcStep, /* bytes between rows in source data */
788 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved XBGR/ABGR data */
789 UINT32 dstStep, /* bytes between rows in dest data */
790 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
791{
792 const UINT16* pr = (const UINT16*)(pSrc[0]);
793 const UINT16* pg = (const UINT16*)(pSrc[1]);
794 const UINT16* pb = (const UINT16*)(pSrc[2]);
795 const UINT32 pad = roi->width % 16;
796 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
797 BYTE* out = NULL;
798 UINT32 srcbump = 0;
799 UINT32 dstbump = 0;
800 out = pDst;
801 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
802 dstbump = (dstStep - (roi->width * sizeof(UINT32)));
803
804 for (UINT32 y = 0; y < roi->height; ++y)
805 {
806 for (UINT32 x = 0; x < roi->width - pad; x += 16)
807 {
808 __m128i r;
809 __m128i g;
810 __m128i b;
811 /* The comments below pretend these are 8-byte registers
812 * rather than 16-byte, for readability.
813 */
814 {
815 __m128i R0;
816 __m128i R1;
817 R0 = LOAD_SI128(pb);
818 pb += 8; /* R0 = 00B300B200B100B0 */
819 R1 = LOAD_SI128(pb);
820 pb += 8; /* R1 = 00B700B600B500B4 */
821 b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
822 }
823 {
824 __m128i R0;
825 __m128i R1;
826 R0 = LOAD_SI128(pg);
827 pg += 8; /* R1 = 00G300G200G100G0 */
828 R1 = LOAD_SI128(pg);
829 pg += 8; /* R2 = 00G700G600G500G4 */
830 g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
831 }
832 {
833 __m128i R0;
834 __m128i R1;
835 R0 = LOAD_SI128(pr);
836 pr += 8; /* R0 = 00R300R200R100R0 */
837 R1 = LOAD_SI128(pr);
838 pr += 8; /* R3 = 00R700R600R500R4 */
839 r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
840 }
841 {
842 __m128i gbHi;
843 __m128i gbLo;
844 __m128i arHi;
845 __m128i arLo;
846 {
847 gbLo = _mm_unpacklo_epi8(a, b); /* gbLo = B3FFB2FFB1FFB0FF */
848 gbHi = _mm_unpackhi_epi8(a, b); /* gbHi = B7FFB6FFB5FFB4FF */
849 arLo = _mm_unpacklo_epi8(g, r); /* arLo = R3G3R2G2R1G1R0G0 */
850 arHi = _mm_unpackhi_epi8(g, r); /* arHi = R7G7R6G6R5G5R4G4 */
851 }
852 {
853 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
854 STORE_SI128(out, bgrx);
855 out += 16; /* R1G1B1FFR0G0B0FF */
856 }
857 {
858 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
859 STORE_SI128(out, bgrx);
860 out += 16; /* R3G3B3FFR2G2B2FF */
861 }
862 {
863 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
864 STORE_SI128(out, bgrx);
865 out += 16; /* R5G5B5FFR4G4B4FF */
866 }
867 {
868 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
869 STORE_SI128(out, bgrx);
870 out += 16; /* R7G7B7FFR6G6B6FF */
871 }
872 }
873 }
874
875 for (UINT32 x = 0; x < pad; x++)
876 {
877 const BYTE R = CLIP(*pr++);
878 const BYTE G = CLIP(*pg++);
879 const BYTE B = CLIP(*pb++);
880 *out++ = 0xFF;
881 *out++ = B;
882 *out++ = G;
883 *out++ = R;
884 }
885
886 /* Jump to next row. */
887 pr += srcbump;
888 pg += srcbump;
889 pb += srcbump;
890 out += dstbump;
891 }
892
893 return PRIMITIVES_SUCCESS;
894}
895
896static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
897 const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
898 UINT32 srcStep, /* bytes between rows in source data */
899 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved XRGB/ARGB data */
900 UINT32 dstStep, /* bytes between rows in dest data */
901 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
902{
903 const UINT16* pr = (const UINT16*)(pSrc[0]);
904 const UINT16* pg = (const UINT16*)(pSrc[1]);
905 const UINT16* pb = (const UINT16*)(pSrc[2]);
906 const __m128i a = mm_set1_epu32(0xFFFFFFFFU);
907 const UINT32 pad = roi->width % 16;
908 BYTE* out = NULL;
909 UINT32 srcbump = 0;
910 UINT32 dstbump = 0;
911 out = pDst;
912 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
913 dstbump = (dstStep - (roi->width * sizeof(UINT32)));
914
915 for (UINT32 y = 0; y < roi->height; ++y)
916 {
917 for (UINT32 x = 0; x < roi->width - pad; x += 16)
918 {
919 __m128i r;
920 __m128i g;
921 __m128i b;
922 /* The comments below pretend these are 8-byte registers
923 * rather than 16-byte, for readability.
924 */
925 {
926 __m128i R0;
927 __m128i R1;
928 R0 = LOAD_SI128(pb);
929 pb += 8; /* R0 = 00B300B200B100B0 */
930 R1 = LOAD_SI128(pb);
931 pb += 8; /* R1 = 00B700B600B500B4 */
932 b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
933 }
934 {
935 __m128i R0;
936 __m128i R1;
937 R0 = LOAD_SI128(pg);
938 pg += 8; /* R1 = 00G300G200G100G0 */
939 R1 = LOAD_SI128(pg);
940 pg += 8; /* R2 = 00G700G600G500G4 */
941 g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
942 }
943 {
944 __m128i R0;
945 __m128i R1;
946 R0 = LOAD_SI128(pr);
947 pr += 8; /* R0 = 00R300R200R100R0 */
948 R1 = LOAD_SI128(pr);
949 pr += 8; /* R3 = 00R700R600R500R4 */
950 r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
951 }
952 {
953 __m128i gbHi;
954 __m128i gbLo;
955 __m128i arHi;
956 __m128i arLo;
957 {
958 gbLo = _mm_unpacklo_epi8(a, r); /* gbLo = R3FFR2FFR1FFR0FF */
959 gbHi = _mm_unpackhi_epi8(a, r); /* gbHi = R7FFR6FFR5FFR4FF */
960 arLo = _mm_unpacklo_epi8(g, b); /* arLo = B3G3B2G2B1G1B0G0 */
961 arHi = _mm_unpackhi_epi8(g, b); /* arHi = B7G7B6G6B5G5B4G4 */
962 }
963 {
964 const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
965 STORE_SI128(out, bgrx);
966 out += 16; /* B1G1R1FFB0G0R0FF */
967 }
968 {
969 const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
970 STORE_SI128(out, bgrx);
971 out += 16; /* B3G3R3FFB2G2R2FF */
972 }
973 {
974 const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
975 STORE_SI128(out, bgrx);
976 out += 16; /* B5G5R5FFB4G4R4FF */
977 }
978 {
979 const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
980 STORE_SI128(out, bgrx);
981 out += 16; /* B7G7R7FFB6G6R6FF */
982 }
983 }
984 }
985
986 for (UINT32 x = 0; x < pad; x++)
987 {
988 const BYTE R = CLIP(*pr++);
989 const BYTE G = CLIP(*pg++);
990 const BYTE B = CLIP(*pb++);
991 *out++ = 0xFF;
992 *out++ = R;
993 *out++ = G;
994 *out++ = B;
995 }
996
997 /* Jump to next row. */
998 pr += srcbump;
999 pg += srcbump;
1000 pb += srcbump;
1001 out += dstbump;
1002 }
1003
1004 return PRIMITIVES_SUCCESS;
1005}
1006
1007static pstatus_t
1008sse2_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
1009 UINT32 srcStep, /* bytes between rows in source data */
1010 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved output, layout given by DstFormat */
1011 UINT32 dstStep, /* bytes between rows in dest data */
1012 UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
1013{
1014 if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1015 (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1016 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1017
1018 switch (DstFormat)
1019 {
1020 case PIXEL_FORMAT_BGRA32:
1021 case PIXEL_FORMAT_BGRX32:
1022 return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1023
1024 case PIXEL_FORMAT_RGBA32:
1025 case PIXEL_FORMAT_RGBX32:
1026 return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1027
1028 case PIXEL_FORMAT_ABGR32:
1029 case PIXEL_FORMAT_XBGR32:
1030 return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1031
1032 case PIXEL_FORMAT_ARGB32:
1033 case PIXEL_FORMAT_XRGB32:
1034 return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
1035
1036 default:
1037 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1038 }
1039}
1040#endif
1041
1042void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims)
1043{
1044#if defined(SSE_AVX_INTRINSICS_ENABLED)
1045 generic = primitives_get_generic();
1046
1047 WLog_VRB(PRIM_TAG, "SSE2 optimizations");
1048 prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
1049 prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
1050 prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
1051
1052#else
1053 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
1054 WINPR_UNUSED(prims);
1055#endif
1056}
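/* Registration sketch (calling the init function directly here is purely
 * illustrative): once it has run, the three pointers assigned above dispatch
 * to the SSE2 kernels, each of which re-checks alignment at run time and
 * defers to the saved generic table when the check fails.
 *
 *   primitives_t prims = { 0 };
 *   primitives_init_colors_sse2_int(&prims);
 *   // prims.yCbCrToRGB_16s8u_P3AC4R now points at sse2_yCbCrToRGB_16s8u_P3AC4R
 *   // (only when SSE_AVX_INTRINSICS_ENABLED is defined)
 */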