prim_YCoCg_ssse3.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized YCoCg<->RGB conversion operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_YCoCg.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;

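/*
 * For reference only: a minimal scalar sketch of the per-pixel math that both
 * vector paths below implement. This helper is an illustration and is not
 * called anywhere; the two SIMD variants differ only in the byte order they
 * write to the destination.
 */
static inline void ycocgr_to_rgb_pixel_sketch(BYTE y, BYTE co, BYTE cg, UINT8 shift,
                                              BYTE* WINPR_RESTRICT r, BYTE* WINPR_RESTRICT g,
                                              BYTE* WINPR_RESTRICT b)
{
    /* Co and Cg arrive as narrowed signed values; shifting left by (shift - 1)
     * rescales them and divides by two in a single step (assumes shift >= 1). */
    const int halfCo = (INT8)(BYTE)(co << (shift - 1));
    const int halfCg = (INT8)(BYTE)(cg << (shift - 1));
    const int t = (int)y - halfCg;   /* Y - Cg/2 */
    const int red = t + halfCo;      /* R = Y + Co/2 - Cg/2 */
    const int grn = (int)y + halfCg; /* G = Y + Cg/2 */
    const int blu = t - halfCo;      /* B = Y - Co/2 - Cg/2 */

    /* Clamp to [0, 255], as _mm_packus_epi16 does in the vector code. */
    *r = (BYTE)((red < 0) ? 0 : ((red > 255) ? 255 : red));
    *g = (BYTE)((grn < 0) ? 0 : ((grn > 255) ? 255 : grn));
    *b = (BYTE)((blu < 0) ? 0 : ((blu > 255) ? 255 : blu));
}
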
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                                  UINT32 dstStep, UINT32 width, UINT32 height,
                                                  UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;

    WINPR_ASSERT(srcStep / sizeof(UINT32) >= width);
    WINPR_ASSERT(dstStep / sizeof(UINT32) >= width);
    const size_t sRowBump = srcStep - width * sizeof(UINT32);
    const size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
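    /* For example, with shift == 2, (v << 2) / 2 == v << 1; with shift == 1
     * the doubling and the halving cancel and the value is used unchanged. */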
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R|   | 1   1/2  -1/2 |   |y|
     * |G| = | 1    0    1/2 | * |o|
     * |B|   | 1  -1/2  -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */

    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;

        while (w >= 8)
        {
            __m128i R0;
            __m128i R1;
            __m128i R2;
            __m128i R3;
            __m128i R4;
            __m128i R5;
            __m128i R6;
            __m128i R7;

            R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            R3 = _mm_shuffle_epi8(R0, R2);
            R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            R5 = _mm_unpackhi_epi32(R3, R4);
            R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
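            /* Example with dataShift == 1: a 16-bit lane holding 0x0181
             * (high byte 0x01, low byte 0x81) becomes 0x0302 after the 16-bit
             * shift; masking every byte with 0xFE then clears the bit that
             * leaked from the low byte into the high byte, giving 0x0202,
             * i.e. each byte shifted left independently. */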
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's from 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            R0 = _mm_packus_epi16(R3, R5);
            /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), w, 1, shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += w * sizeof(UINT32);
            dptr += w * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
{
    const BYTE* sptr = pSrc;
    BYTE* dptr = pDst;
    size_t sRowBump = srcStep - width * sizeof(UINT32);
    size_t dRowBump = dstStep - width * sizeof(UINT32);
    /* Shift left by "shift" and divide by two is the same as shift
     * left by "shift-1".
     */
    int dataShift = shift - 1;
    BYTE mask = (BYTE)(0xFFU << dataShift);

    /* Let's say the data is of the form:
     * a0y0o0g0 a1y1o1g1 a2y2o2g2...
     * Apply:
     * |R|   | 1   1/2  -1/2 |   |y|
     * |G| = | 1    0    1/2 | * |o|
     * |B|   | 1  -1/2  -1/2 |   |g|
     * where Y is 8-bit unsigned and o & g are 8-bit signed.
     */

    if ((width < 8) || (ULONG_PTR)dptr & 0x03)
    {
        /* Too small, or we'll never hit a 16-byte boundary. Punt. */
        return generic->YCoCgToRGB_8u_AC4R(pSrc, WINPR_ASSERTING_INT_CAST(INT32, srcStep), pDst,
                                           DstFormat, WINPR_ASSERTING_INT_CAST(INT32, dstStep),
                                           width, height, shift, withAlpha);
    }

    for (UINT32 h = 0; h < height; h++)
    {
        UINT32 w = width;

        while (w >= 8)
        {
            __m128i R7;

            /* The faster path, 16-byte aligned load. */
            __m128i R0 = LOAD_SI128(sptr);
            sptr += (128 / 8);
            __m128i R1 = LOAD_SI128(sptr);
            sptr += (128 / 8);

            /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
            /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
            /* Shuffle to pack all the like types together. */
            __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
            __m128i R3 = _mm_shuffle_epi8(R0, R2);
            __m128i R4 = _mm_shuffle_epi8(R1, R2);
            /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
            /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
            __m128i R5 = _mm_unpackhi_epi32(R3, R4);
            __m128i R6 = _mm_unpacklo_epi32(R3, R4);

            /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
            /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Save alphas aside */
            if (withAlpha)
                R7 = _mm_unpackhi_epi64(R5, R5);
            else
                R7 = mm_set1_epu32(0xFFFFFFFFU);

            /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
            /* Expand Y's from 8-bit unsigned to 16-bit signed. */
            R1 = mm_set1_epu32(0);
            R0 = _mm_unpacklo_epi8(R5, R1);
            /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
            /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
             * Note: this must be done before sign-conversion.
             * Note also there is no slli_epi8, so we have to use a 16-bit
             * version and then mask.
             */
            R6 = _mm_slli_epi16(R6, dataShift);
            R1 = mm_set1_epu8(mask);
            R6 = _mm_and_si128(R6, R1);
            /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
            /* Expand Co's from 8-bit signed to 16-bit signed */
            R1 = _mm_unpackhi_epi8(R6, R6);
            R1 = _mm_srai_epi16(R1, 8);
            /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
            /* Expand Cg's from 8-bit signed to 16-bit signed */
            R2 = _mm_unpacklo_epi8(R6, R6);
            R2 = _mm_srai_epi16(R2, 8);
            /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
            /* Get Y - halfCg and save */
            R6 = _mm_subs_epi16(R0, R2);
            /* R = (Y-halfCg) + halfCo */
            R3 = _mm_adds_epi16(R6, R1);
            /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
            /* G = Y + Cg(/2) */
            R4 = _mm_adds_epi16(R0, R2);
            /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
            /* B = (Y-halfCg) - Co(/2) */
            R5 = _mm_subs_epi16(R6, R1);
            /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
            /* Repack R's & B's. */
            /* This line is the only diff between inverted and non-inverted.
             * Unfortunately, it would be expensive to check "inverted"
             * every time through this loop.
             */
            R0 = _mm_packus_epi16(R5, R3);
            /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
            /* Repack G's. */
            R1 = _mm_packus_epi16(R4, R4);
            /* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
            /* And add the A's. */
            R1 = _mm_unpackhi_epi64(R1, R7);
            /* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
            /* Now do interleaving again. */
            R2 = _mm_unpacklo_epi8(R0, R1);
            /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
            R3 = _mm_unpackhi_epi8(R0, R1);
            /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
            R4 = _mm_unpacklo_epi16(R2, R3);
            /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
            R5 = _mm_unpackhi_epi16(R2, R3);
            /* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
            STORE_SI128(dptr, R4);
            dptr += (128 / 8);
            STORE_SI128(dptr, R5);
            dptr += (128 / 8);
            w -= 8;
        }

        /* Handle any remainder pixels. */
        if (w > 0)
        {
            pstatus_t status = 0;
            status = generic->YCoCgToRGB_8u_AC4R(
                sptr, WINPR_ASSERTING_INT_CAST(INT32, srcStep), dptr, DstFormat,
                WINPR_ASSERTING_INT_CAST(INT32, dstStep), WINPR_ASSERTING_INT_CAST(UINT32, w), 1,
                shift, withAlpha);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
            dptr += WINPR_ASSERTING_INT_CAST(UINT32, w) * sizeof(UINT32);
        }

        sptr += sRowBump;
        dptr += dRowBump;
    }

    return PRIMITIVES_SUCCESS;
}

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
                                           BOOL withAlpha)
{
    switch (DstFormat)
    {
        case PIXEL_FORMAT_BGRX32:
        case PIXEL_FORMAT_BGRA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_invert(
                pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
                WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

        case PIXEL_FORMAT_RGBX32:
        case PIXEL_FORMAT_RGBA32:
            return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
                pSrc, WINPR_ASSERTING_INT_CAST(UINT32, srcStep), pDst, DstFormat,
                WINPR_ASSERTING_INT_CAST(UINT32, dstStep), width, height, shift, withAlpha);

        default:
            return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
                                               height, shift, withAlpha);
    }
}

#endif

/* ------------------------------------------------------------------------- */
void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    generic = primitives_get_generic();

    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
    WINPR_UNUSED(prims);
#endif
}
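
/*
 * Typical call path (illustrative; the shift value and the buffer setup are
 * only an example): once the primitives subsystem has been initialized,
 * callers go through the primitives table rather than calling the ssse3_*
 * functions above directly.
 *
 *     primitives_t* prims = primitives_get();
 *     pstatus_t rc = prims->YCoCgToRGB_8u_AC4R(src, srcStep, dst, PIXEL_FORMAT_BGRX32,
 *                                              dstStep, width, height, shift, FALSE);
 *     if (rc != PRIMITIVES_SUCCESS)
 *         ... handle the error ...
 */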