FreeRDP
Loading...
Searching...
No Matches
nsc_sse2.c
1
20#include <winpr/assert.h>
21#include <winpr/cast.h>
22#include <winpr/platform.h>
23#include <freerdp/config.h>
24
25#include "../nsc_types.h"
26#include "nsc_sse2.h"
27
28#include "../../core/simd.h"
29#include "../../primitives/sse/prim_avxsse.h"
30
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#include <xmmintrin.h>
37#include <emmintrin.h>
38
39#include <freerdp/codec/color.h>
40#include <winpr/crt.h>
41#include <winpr/sysinfo.h>
42
43static inline size_t nsc_encode_next_rgba(UINT32 format, const BYTE* src, const BYTE* palette,
44 __m128i* r_val, __m128i* g_val, __m128i* b_val,
45 __m128i* a_val)
46{
47 switch (format)
48 {
49 case PIXEL_FORMAT_BGRX32:
50 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
51 *(src + 8), *(src + 4), *src);
52 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
53 *(src + 9), *(src + 5), *(src + 1));
54 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
55 *(src + 10), *(src + 6), *(src + 2));
56 *a_val = _mm_set1_epi16(0xFF);
57 return 32;
58
59 case PIXEL_FORMAT_BGRA32:
60 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
61 *(src + 8), *(src + 4), *src);
62 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
63 *(src + 9), *(src + 5), *(src + 1));
64 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
65 *(src + 10), *(src + 6), *(src + 2));
66 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
67 *(src + 11), *(src + 7), *(src + 3));
68 return 32;
69
70 case PIXEL_FORMAT_RGBX32:
71 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
72 *(src + 8), *(src + 4), *src);
73 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
74 *(src + 9), *(src + 5), *(src + 1));
75 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
76 *(src + 10), *(src + 6), *(src + 2));
77 *a_val = _mm_set1_epi16(0xFF);
78 return 32;
79
80 case PIXEL_FORMAT_RGBA32:
81 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
82 *(src + 8), *(src + 4), *src);
83 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
84 *(src + 9), *(src + 5), *(src + 1));
85 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
86 *(src + 10), *(src + 6), *(src + 2));
87 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
88 *(src + 11), *(src + 7), *(src + 3));
89 return 32;
90
91 case PIXEL_FORMAT_BGR24:
92 *b_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
93 *(src + 6), *(src + 3), *src);
94 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
95 *(src + 7), *(src + 4), *(src + 1));
96 *r_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
97 *(src + 8), *(src + 5), *(src + 2));
98 *a_val = _mm_set1_epi16(0xFF);
99 return 24;
100
101 case PIXEL_FORMAT_RGB24:
102 *r_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
103 *(src + 6), *(src + 3), *src);
104 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
105 *(src + 7), *(src + 4), *(src + 1));
106 *b_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
107 *(src + 8), *(src + 5), *(src + 2));
108 *a_val = _mm_set1_epi16(0xFF);
109 return 24;
110
111 case PIXEL_FORMAT_BGR16:
112 *b_val = _mm_set_epi16(
113 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 15)) & 0xF8) | ((*(src + 15)) >> 5)),
114 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 13)) & 0xF8) | ((*(src + 13)) >> 5)),
115 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 11)) & 0xF8) | ((*(src + 11)) >> 5)),
116 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 9)) & 0xF8) | ((*(src + 9)) >> 5)),
117 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 7)) & 0xF8) | ((*(src + 7)) >> 5)),
118 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 5)) & 0xF8) | ((*(src + 5)) >> 5)),
119 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 3)) & 0xF8) | ((*(src + 3)) >> 5)),
120 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 1)) & 0xF8) | ((*(src + 1)) >> 5)));
121 *g_val =
122 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 15)) & 0x07) << 5) |
123 (((*(src + 14)) & 0xE0) >> 3)),
124 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 13)) & 0x07) << 5) |
125 (((*(src + 12)) & 0xE0) >> 3)),
126 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 11)) & 0x07) << 5) |
127 (((*(src + 10)) & 0xE0) >> 3)),
128 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 9)) & 0x07) << 5) |
129 (((*(src + 8)) & 0xE0) >> 3)),
130 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 7)) & 0x07) << 5) |
131 (((*(src + 6)) & 0xE0) >> 3)),
132 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 5)) & 0x07) << 5) |
133 (((*(src + 4)) & 0xE0) >> 3)),
134 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 3)) & 0x07) << 5) |
135 (((*(src + 2)) & 0xE0) >> 3)),
136 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) |
137 (((*src) & 0xE0) >> 3)));
138 *r_val = _mm_set_epi16(
139 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 14)) & 0x1F) << 3) |
140 (((*(src + 14)) >> 2) & 0x07)),
141 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 12)) & 0x1F) << 3) |
142 (((*(src + 12)) >> 2) & 0x07)),
143 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 10)) & 0x1F) << 3) |
144 (((*(src + 10)) >> 2) & 0x07)),
145 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 8)) & 0x1F) << 3) |
146 (((*(src + 8)) >> 2) & 0x07)),
147 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 6)) & 0x1F) << 3) |
148 (((*(src + 6)) >> 2) & 0x07)),
149 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 4)) & 0x1F) << 3) |
150 (((*(src + 4)) >> 2) & 0x07)),
151 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 2)) & 0x1F) << 3) |
152 (((*(src + 2)) >> 2) & 0x07)),
153 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
154 *a_val = _mm_set1_epi16(0xFF);
155 return 16;
156
157 case PIXEL_FORMAT_RGB16:
158 *r_val =
159 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, ((src[15] & 0xF8) | (src[15] >> 5))),
160 WINPR_ASSERTING_INT_CAST(INT16, ((src[13] & 0xF8) | (src[13] >> 5))),
161 WINPR_ASSERTING_INT_CAST(INT16, ((src[11] & 0xF8) | (src[11] >> 5))),
162 WINPR_ASSERTING_INT_CAST(INT16, ((src[9] & 0xF8) | (src[9] >> 5))),
163 WINPR_ASSERTING_INT_CAST(INT16, ((src[7] & 0xF8) | (src[7] >> 5))),
164 WINPR_ASSERTING_INT_CAST(INT16, ((src[5] & 0xF8) | (src[5] >> 5))),
165 WINPR_ASSERTING_INT_CAST(INT16, ((src[3] & 0xF8) | (src[3] >> 5))),
166 WINPR_ASSERTING_INT_CAST(INT16, ((src[1] & 0xF8) | (src[1] >> 5))));
167 *g_val =
168 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 15)) & 0x07) << 5) |
169 (((*(src + 14)) & 0xE0) >> 3)),
170 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 13)) & 0x07) << 5) |
171 (((*(src + 12)) & 0xE0) >> 3)),
172 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 11)) & 0x07) << 5) |
173 (((*(src + 10)) & 0xE0) >> 3)),
174 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 9)) & 0x07) << 5) |
175 (((*(src + 8)) & 0xE0) >> 3)),
176 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 7)) & 0x07) << 5) |
177 (((*(src + 6)) & 0xE0) >> 3)),
178 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 5)) & 0x07) << 5) |
179 (((*(src + 4)) & 0xE0) >> 3)),
180 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 3)) & 0x07) << 5) |
181 (((*(src + 2)) & 0xE0) >> 3)),
182 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) |
183 (((*src) & 0xE0) >> 3)));
184 *b_val = _mm_set_epi16(
185 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 14)) & 0x1F) << 3) |
186 (((*(src + 14)) >> 2) & 0x07)),
187 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 12)) & 0x1F) << 3) |
188 (((*(src + 12)) >> 2) & 0x07)),
189 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 10)) & 0x1F) << 3) |
190 (((*(src + 10)) >> 2) & 0x07)),
191 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 8)) & 0x1F) << 3) |
192 (((*(src + 8)) >> 2) & 0x07)),
193 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 6)) & 0x1F) << 3) |
194 (((*(src + 6)) >> 2) & 0x07)),
195 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 4)) & 0x1F) << 3) |
196 (((*(src + 4)) >> 2) & 0x07)),
197 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 2)) & 0x1F) << 3) |
198 (((*(src + 2)) >> 2) & 0x07)),
199 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
200 *a_val = _mm_set1_epi16(0xFF);
201 return 16;
202
203 case PIXEL_FORMAT_A4:
204 {
205 BYTE idx[8] = { 0 };
206
207 for (int shift = 7; shift >= 0; shift--)
208 {
209 idx[shift] = ((*src) >> shift) & 1;
210 idx[shift] |= (((*(src + 1)) >> shift) & 1) << 1;
211 idx[shift] |= (((*(src + 2)) >> shift) & 1) << 2;
212 idx[shift] |= (((*(src + 3)) >> shift) & 1) << 3;
213 idx[shift] *= 3;
214 }
215
216 *r_val =
217 _mm_set_epi16(palette[idx[0]], palette[idx[1]], palette[idx[2]], palette[idx[3]],
218 palette[idx[4]], palette[idx[5]], palette[idx[6]], palette[idx[7]]);
219 *g_val = _mm_set_epi16(palette[idx[0] + 1], palette[idx[1] + 1], palette[idx[2] + 1],
220 palette[idx[3] + 1], palette[idx[4] + 1], palette[idx[5] + 1],
221 palette[idx[6] + 1], palette[idx[7] + 1]);
222 *b_val = _mm_set_epi16(palette[idx[0] + 2], palette[idx[1] + 2], palette[idx[2] + 2],
223 palette[idx[3] + 2], palette[idx[4] + 2], palette[idx[5] + 2],
224 palette[idx[6] + 2], palette[idx[7] + 2]);
225 *a_val = _mm_set1_epi16(0xFF);
226 return 4;
227 }
228
229 case PIXEL_FORMAT_RGB8:
230 {
231 *r_val = _mm_set_epi16(palette[(*(src + 7ULL)) * 3ULL], palette[(*(src + 6ULL)) * 3ULL],
232 palette[(*(src + 5ULL)) * 3ULL], palette[(*(src + 4ULL)) * 3ULL],
233 palette[(*(src + 3ULL)) * 3ULL], palette[(*(src + 2ULL)) * 3ULL],
234 palette[(*(src + 1ULL)) * 3ULL], palette[(*src) * 3ULL]);
235 *g_val = _mm_set_epi16(
236 palette[(*(src + 7ULL)) * 3ULL + 1ULL], palette[(*(src + 6ULL)) * 3ULL + 1ULL],
237 palette[(*(src + 5ULL)) * 3ULL + 1ULL], palette[(*(src + 4ULL)) * 3ULL + 1ULL],
238 palette[(*(src + 3ULL)) * 3ULL + 1ULL], palette[(*(src + 2ULL)) * 3ULL + 1ULL],
239 palette[(*(src + 1ULL)) * 3ULL + 1ULL], palette[(*src) * 3ULL + 1ULL]);
240 *b_val = _mm_set_epi16(
241 palette[(*(src + 7ULL)) * 3ULL + 2ULL], palette[(*(src + 6ULL)) * 3ULL + 2ULL],
242 palette[(*(src + 5ULL)) * 3ULL + 2ULL], palette[(*(src + 4ULL)) * 3ULL + 2ULL],
243 palette[(*(src + 3ULL)) * 3ULL + 2ULL], palette[(*(src + 2ULL)) * 3ULL + 2ULL],
244 palette[(*(src + 1ULL)) * 3ULL + 2ULL], palette[(*src) * 3ULL + 2ULL]);
245 *a_val = _mm_set1_epi16(0xFF);
246 return 8;
247 }
248
249 default:
250 return 0;
251 }
252}
253
254static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, const BYTE* data, UINT32 scanline)
255{
256 size_t y = 0;
257
258 if (!context || !data || (scanline == 0))
259 return FALSE;
260
261 const UINT16 tempWidth = ROUND_UP_TO(context->width, 8);
262 const UINT16 rw = (context->ChromaSubsamplingLevel > 0 ? tempWidth : context->width);
263
264 const BYTE ccl = WINPR_ASSERTING_INT_CAST(BYTE, context->ColorLossLevel);
265
266 for (; y < context->height; y++)
267 {
268 const BYTE* src = data + (context->height - 1 - y) * scanline;
269 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
270 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
271 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
272 BYTE* aplane = context->priv->PlaneBuffers[3] + y * context->width;
273
274 for (UINT16 x = 0; x < context->width; x += 8)
275 {
276 __m128i r_val = { 0 };
277 __m128i g_val = { 0 };
278 __m128i b_val = { 0 };
279 __m128i a_val = { 0 };
280
281 const size_t rc = nsc_encode_next_rgba(context->format, src, context->palette, &r_val,
282 &g_val, &b_val, &a_val);
283 src += rc;
284
285 __m128i y_val = _mm_srai_epi16(r_val, 2);
286 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(g_val, 1));
287 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(b_val, 2));
288 __m128i co_val = _mm_sub_epi16(r_val, b_val);
289 co_val = _mm_srai_epi16(co_val, ccl);
290 __m128i cg_val = _mm_sub_epi16(g_val, _mm_srai_epi16(r_val, 1));
291 cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
292 cg_val = _mm_srai_epi16(cg_val, ccl);
293 y_val = _mm_packus_epi16(y_val, y_val);
294 STORE_SI128(yplane, y_val);
295 co_val = _mm_packs_epi16(co_val, co_val);
296 STORE_SI128(coplane, co_val);
297 cg_val = _mm_packs_epi16(cg_val, cg_val);
298 STORE_SI128(cgplane, cg_val);
299 a_val = _mm_packus_epi16(a_val, a_val);
300 STORE_SI128(aplane, a_val);
301 yplane += 8;
302 coplane += 8;
303 cgplane += 8;
304 aplane += 8;
305 }
306
307 if (context->ChromaSubsamplingLevel > 0 && (context->width % 2) == 1)
308 {
309 context->priv->PlaneBuffers[0][y * rw + context->width] =
310 context->priv->PlaneBuffers[0][y * rw + context->width - 1];
311 context->priv->PlaneBuffers[1][y * rw + context->width] =
312 context->priv->PlaneBuffers[1][y * rw + context->width - 1];
313 context->priv->PlaneBuffers[2][y * rw + context->width] =
314 context->priv->PlaneBuffers[2][y * rw + context->width - 1];
315 }
316 }
317
318 if (context->ChromaSubsamplingLevel > 0 && (y % 2) == 1)
319 {
320 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
321 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
322 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
323 CopyMemory(yplane, yplane - rw, rw);
324 CopyMemory(coplane, coplane - rw, rw);
325 CopyMemory(cgplane, cgplane - rw, rw);
326 }
327
328 return TRUE;
329}
330
331static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
332{
333 BYTE* co_dst = NULL;
334 BYTE* cg_dst = NULL;
335 INT8* co_src0 = NULL;
336 INT8* co_src1 = NULL;
337 INT8* cg_src0 = NULL;
338 INT8* cg_src1 = NULL;
339 UINT32 tempWidth = 0;
340 UINT32 tempHeight = 0;
341 __m128i t;
342 __m128i val;
343 __m128i mask = _mm_set1_epi16(0xFF);
344 tempWidth = ROUND_UP_TO(context->width, 8);
345 tempHeight = ROUND_UP_TO(context->height, 2);
346
347 for (size_t y = 0; y < tempHeight >> 1; y++)
348 {
349 co_dst = context->priv->PlaneBuffers[1] + y * (tempWidth >> 1);
350 cg_dst = context->priv->PlaneBuffers[2] + y * (tempWidth >> 1);
351 co_src0 = (INT8*)context->priv->PlaneBuffers[1] + (y << 1) * tempWidth;
352 co_src1 = co_src0 + tempWidth;
353 cg_src0 = (INT8*)context->priv->PlaneBuffers[2] + (y << 1) * tempWidth;
354 cg_src1 = cg_src0 + tempWidth;
355
356 for (UINT32 x = 0; x < tempWidth >> 1; x += 8)
357 {
358 t = LOAD_SI128(co_src0);
359 t = _mm_avg_epu8(t, LOAD_SI128(co_src1));
360 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
361 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
362 val = _mm_packus_epi16(val, val);
363 STORE_SI128(co_dst, val);
364 co_dst += 8;
365 co_src0 += 16;
366 co_src1 += 16;
367 t = LOAD_SI128(cg_src0);
368 t = _mm_avg_epu8(t, LOAD_SI128(cg_src1));
369 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
370 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
371 val = _mm_packus_epi16(val, val);
372 STORE_SI128(cg_dst, val);
373 cg_dst += 8;
374 cg_src0 += 16;
375 cg_src1 += 16;
376 }
377 }
378}
379
380static BOOL nsc_encode_sse2(NSC_CONTEXT* WINPR_RESTRICT context, const BYTE* WINPR_RESTRICT data,
381 UINT32 scanline)
382{
383 if (!nsc_encode_argb_to_aycocg_sse2(context, data, scanline))
384 return FALSE;
385
386 if (context->ChromaSubsamplingLevel > 0)
387 nsc_encode_subsampling_sse2(context);
388
389 return TRUE;
390}
391#endif
392
393void nsc_init_sse2_int(NSC_CONTEXT* WINPR_RESTRICT context)
394{
395#if defined(SSE_AVX_INTRINSICS_ENABLED)
396 PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2")
397 context->encode = nsc_encode_sse2;
398#else
399 WINPR_UNUSED(context);
400#endif
401}