20#include <winpr/assert.h>
21#include <winpr/cast.h>
22#include <winpr/platform.h>
23#include <freerdp/config.h>
25#include "../nsc_types.h"
28#include "../../core/simd.h"
29#include "../../primitives/sse/prim_avxsse.h"
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
39#include <freerdp/codec/color.h>
41#include <winpr/sysinfo.h>
/*
 * Gather the next group of eight pixels starting at src and widen each colour
 * channel into the eight 16-bit lanes of one SSE register.
 *
 * format  - PIXEL_FORMAT_* value selecting the byte layout that is read
 * src     - first byte of the pixel group
 * palette - lookup table used by the indexed formats (A4 and RGB8)
 * r_val, g_val, b_val - receive the red, green and blue lanes
 * Alpha is written through a_val; formats without an alpha byte get 0xFF.
 *
 * _mm_set_epi16 takes its arguments from the highest lane down to lane 0, so
 * the last argument of every call below ends up in lane 0.
 *
 * NOTE(review): no return statement is visible on these lines; the size_t
 * result is presumably the number of source bytes consumed - confirm.
 */
43static inline size_t nsc_encode_next_rgba(UINT32 format,
const BYTE* src,
const BYTE* palette,
44 __m128i* r_val, __m128i* g_val, __m128i* b_val,
/* 4 bytes per pixel, byte order B, G, R, unused: alpha is forced to 0xFF. */
49 case PIXEL_FORMAT_BGRX32:
50 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
51 *(src + 8), *(src + 4), *src);
52 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
53 *(src + 9), *(src + 5), *(src + 1));
54 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
55 *(src + 10), *(src + 6), *(src + 2));
56 *a_val = _mm_set1_epi16(0xFF);
/* 4 bytes per pixel, byte order B, G, R, A: alpha comes from the fourth byte. */
59 case PIXEL_FORMAT_BGRA32:
60 *b_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
61 *(src + 8), *(src + 4), *src);
62 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
63 *(src + 9), *(src + 5), *(src + 1));
64 *r_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
65 *(src + 10), *(src + 6), *(src + 2));
66 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
67 *(src + 11), *(src + 7), *(src + 3));
/* 4 bytes per pixel, byte order R, G, B, unused: alpha is forced to 0xFF. */
70 case PIXEL_FORMAT_RGBX32:
71 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
72 *(src + 8), *(src + 4), *src);
73 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
74 *(src + 9), *(src + 5), *(src + 1));
75 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
76 *(src + 10), *(src + 6), *(src + 2));
77 *a_val = _mm_set1_epi16(0xFF);
/* 4 bytes per pixel, byte order R, G, B, A: alpha comes from the fourth byte. */
80 case PIXEL_FORMAT_RGBA32:
81 *r_val = _mm_set_epi16(*(src + 28), *(src + 24), *(src + 20), *(src + 16), *(src + 12),
82 *(src + 8), *(src + 4), *src);
83 *g_val = _mm_set_epi16(*(src + 29), *(src + 25), *(src + 21), *(src + 17), *(src + 13),
84 *(src + 9), *(src + 5), *(src + 1));
85 *b_val = _mm_set_epi16(*(src + 30), *(src + 26), *(src + 22), *(src + 18), *(src + 14),
86 *(src + 10), *(src + 6), *(src + 2));
87 *a_val = _mm_set_epi16(*(src + 31), *(src + 27), *(src + 23), *(src + 19), *(src + 15),
88 *(src + 11), *(src + 7), *(src + 3));
/* 3 bytes per pixel, byte order B, G, R: alpha is forced to 0xFF. */
91 case PIXEL_FORMAT_BGR24:
92 *b_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
93 *(src + 6), *(src + 3), *src);
94 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
95 *(src + 7), *(src + 4), *(src + 1));
96 *r_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
97 *(src + 8), *(src + 5), *(src + 2));
98 *a_val = _mm_set1_epi16(0xFF);
/* 3 bytes per pixel, byte order R, G, B: alpha is forced to 0xFF. */
101 case PIXEL_FORMAT_RGB24:
102 *r_val = _mm_set_epi16(*(src + 21), *(src + 18), *(src + 15), *(src + 12), *(src + 9),
103 *(src + 6), *(src + 3), *src);
104 *g_val = _mm_set_epi16(*(src + 22), *(src + 19), *(src + 16), *(src + 13), *(src + 10),
105 *(src + 7), *(src + 4), *(src + 1));
106 *b_val = _mm_set_epi16(*(src + 23), *(src + 20), *(src + 17), *(src + 14), *(src + 11),
107 *(src + 8), *(src + 5), *(src + 2));
108 *a_val = _mm_set1_epi16(0xFF);
/*
 * 2 bytes per pixel. Blue is taken from the second (odd-offset) byte: its top
 * five bits are kept and its top three bits are copied into the low three
 * bits, giving an 8-bit value.
 */
111 case PIXEL_FORMAT_BGR16:
112 *b_val = _mm_set_epi16(
113 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 15)) & 0xF8) | ((*(src + 15)) >> 5)),
114 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 13)) & 0xF8) | ((*(src + 13)) >> 5)),
115 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 11)) & 0xF8) | ((*(src + 11)) >> 5)),
116 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 9)) & 0xF8) | ((*(src + 9)) >> 5)),
117 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 7)) & 0xF8) | ((*(src + 7)) >> 5)),
118 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 5)) & 0xF8) | ((*(src + 5)) >> 5)),
119 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 3)) & 0xF8) | ((*(src + 3)) >> 5)),
120 WINPR_ASSERTING_INT_CAST(INT16, ((*(src + 1)) & 0xF8) | ((*(src + 1)) >> 5)));
/*
 * Six bits that straddle both bytes: the low three bits of the second byte
 * become bits 7..5 and the top three bits of the first byte become bits 4..2.
 * NOTE(review): the assignment target of this expression is not visible here;
 * since b_val and r_val are set in this case it is presumably *g_val - confirm.
 */
122 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 15)) & 0x07) << 5) |
123 (((*(src + 14)) & 0xE0) >> 3)),
124 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 13)) & 0x07) << 5) |
125 (((*(src + 12)) & 0xE0) >> 3)),
126 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 11)) & 0x07) << 5) |
127 (((*(src + 10)) & 0xE0) >> 3)),
128 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 9)) & 0x07) << 5) |
129 (((*(src + 8)) & 0xE0) >> 3)),
130 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 7)) & 0x07) << 5) |
131 (((*(src + 6)) & 0xE0) >> 3)),
132 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 5)) & 0x07) << 5) |
133 (((*(src + 4)) & 0xE0) >> 3)),
134 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 3)) & 0x07) << 5) |
135 (((*(src + 2)) & 0xE0) >> 3)),
136 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) |
137 (((*src) & 0xE0) >> 3)));
/*
 * Red is taken from the first (even-offset) byte: its low five bits are moved
 * to bits 7..3 and bits 4..2 of the same byte fill the low three bits.
 */
138 *r_val = _mm_set_epi16(
139 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 14)) & 0x1F) << 3) |
140 (((*(src + 14)) >> 2) & 0x07)),
141 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 12)) & 0x1F) << 3) |
142 (((*(src + 12)) >> 2) & 0x07)),
143 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 10)) & 0x1F) << 3) |
144 (((*(src + 10)) >> 2) & 0x07)),
145 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 8)) & 0x1F) << 3) |
146 (((*(src + 8)) >> 2) & 0x07)),
147 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 6)) & 0x1F) << 3) |
148 (((*(src + 6)) >> 2) & 0x07)),
149 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 4)) & 0x1F) << 3) |
150 (((*(src + 4)) >> 2) & 0x07)),
151 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 2)) & 0x1F) << 3) |
152 (((*(src + 2)) >> 2) & 0x07)),
153 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
154 *a_val = _mm_set1_epi16(0xFF);
/*
 * 2 bytes per pixel, using the same three bit-extraction expressions as the
 * BGR16 case above, with the last one stored to b_val instead of r_val.
 */
157 case PIXEL_FORMAT_RGB16:
/*
 * Top five bits of the second byte, expanded to 8 bits.
 * NOTE(review): the assignment target is not visible here; presumably *r_val
 * - confirm.
 */
159 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, ((src[15] & 0xF8) | (src[15] >> 5))),
160 WINPR_ASSERTING_INT_CAST(INT16, ((src[13] & 0xF8) | (src[13] >> 5))),
161 WINPR_ASSERTING_INT_CAST(INT16, ((src[11] & 0xF8) | (src[11] >> 5))),
162 WINPR_ASSERTING_INT_CAST(INT16, ((src[9] & 0xF8) | (src[9] >> 5))),
163 WINPR_ASSERTING_INT_CAST(INT16, ((src[7] & 0xF8) | (src[7] >> 5))),
164 WINPR_ASSERTING_INT_CAST(INT16, ((src[5] & 0xF8) | (src[5] >> 5))),
165 WINPR_ASSERTING_INT_CAST(INT16, ((src[3] & 0xF8) | (src[3] >> 5))),
166 WINPR_ASSERTING_INT_CAST(INT16, ((src[1] & 0xF8) | (src[1] >> 5))));
/*
 * Six bits that straddle both bytes, placed in bits 7..2.
 * NOTE(review): the assignment target is not visible here; presumably *g_val
 * - confirm.
 */
168 _mm_set_epi16(WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 15)) & 0x07) << 5) |
169 (((*(src + 14)) & 0xE0) >> 3)),
170 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 13)) & 0x07) << 5) |
171 (((*(src + 12)) & 0xE0) >> 3)),
172 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 11)) & 0x07) << 5) |
173 (((*(src + 10)) & 0xE0) >> 3)),
174 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 9)) & 0x07) << 5) |
175 (((*(src + 8)) & 0xE0) >> 3)),
176 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 7)) & 0x07) << 5) |
177 (((*(src + 6)) & 0xE0) >> 3)),
178 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 5)) & 0x07) << 5) |
179 (((*(src + 4)) & 0xE0) >> 3)),
180 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 3)) & 0x07) << 5) |
181 (((*(src + 2)) & 0xE0) >> 3)),
182 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 1)) & 0x07) << 5) |
183 (((*src) & 0xE0) >> 3)));
/* Low five bits of the first byte, expanded to 8 bits. */
184 *b_val = _mm_set_epi16(
185 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 14)) & 0x1F) << 3) |
186 (((*(src + 14)) >> 2) & 0x07)),
187 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 12)) & 0x1F) << 3) |
188 (((*(src + 12)) >> 2) & 0x07)),
189 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 10)) & 0x1F) << 3) |
190 (((*(src + 10)) >> 2) & 0x07)),
191 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 8)) & 0x1F) << 3) |
192 (((*(src + 8)) >> 2) & 0x07)),
193 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 6)) & 0x1F) << 3) |
194 (((*(src + 6)) >> 2) & 0x07)),
195 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 4)) & 0x1F) << 3) |
196 (((*(src + 4)) >> 2) & 0x07)),
197 WINPR_ASSERTING_INT_CAST(INT16, (((*(src + 2)) & 0x1F) << 3) |
198 (((*(src + 2)) >> 2) & 0x07)),
199 WINPR_ASSERTING_INT_CAST(INT16, (((*src) & 0x1F) << 3) | (((*src) >> 2) & 0x07)));
200 *a_val = _mm_set1_epi16(0xFF);
/*
 * Indexed format read from four source bytes: for each bit position `shift`,
 * idx[shift] is assembled from that bit of src[0] (bit 0), src[1] (bit 1),
 * src[2] (bit 2) and src[3] (bit 3).
 */
203 case PIXEL_FORMAT_A4:
207 for (
int shift = 7; shift >= 0; shift--)
209 idx[shift] = ((*src) >> shift) & 1;
210 idx[shift] |= (((*(src + 1)) >> shift) & 1) << 1;
211 idx[shift] |= (((*(src + 2)) >> shift) & 1) << 2;
212 idx[shift] |= (((*(src + 3)) >> shift) & 1) << 3;
/*
 * Palette lookups at idx, idx + 1 and idx + 2. idx[0] is the first argument
 * and so goes to the highest lane; idx[7] goes to lane 0.
 * NOTE(review): the assignment target of the first lookup is not visible
 * here; presumably *r_val - confirm.
 */
217 _mm_set_epi16(palette[idx[0]], palette[idx[1]], palette[idx[2]], palette[idx[3]],
218 palette[idx[4]], palette[idx[5]], palette[idx[6]], palette[idx[7]]);
219 *g_val = _mm_set_epi16(palette[idx[0] + 1], palette[idx[1] + 1], palette[idx[2] + 1],
220 palette[idx[3] + 1], palette[idx[4] + 1], palette[idx[5] + 1],
221 palette[idx[6] + 1], palette[idx[7] + 1]);
222 *b_val = _mm_set_epi16(palette[idx[0] + 2], palette[idx[1] + 2], palette[idx[2] + 2],
223 palette[idx[3] + 2], palette[idx[4] + 2], palette[idx[5] + 2],
224 palette[idx[6] + 2], palette[idx[7] + 2]);
225 *a_val = _mm_set1_epi16(0xFF);
/*
 * 1 byte per pixel: each source byte is a palette index. The palette is read
 * as three consecutive bytes per entry (red at index * 3, then green, blue).
 */
229 case PIXEL_FORMAT_RGB8:
231 *r_val = _mm_set_epi16(palette[(*(src + 7ULL)) * 3ULL], palette[(*(src + 6ULL)) * 3ULL],
232 palette[(*(src + 5ULL)) * 3ULL], palette[(*(src + 4ULL)) * 3ULL],
233 palette[(*(src + 3ULL)) * 3ULL], palette[(*(src + 2ULL)) * 3ULL],
234 palette[(*(src + 1ULL)) * 3ULL], palette[(*src) * 3ULL]);
235 *g_val = _mm_set_epi16(
236 palette[(*(src + 7ULL)) * 3ULL + 1ULL], palette[(*(src + 6ULL)) * 3ULL + 1ULL],
237 palette[(*(src + 5ULL)) * 3ULL + 1ULL], palette[(*(src + 4ULL)) * 3ULL + 1ULL],
238 palette[(*(src + 3ULL)) * 3ULL + 1ULL], palette[(*(src + 2ULL)) * 3ULL + 1ULL],
239 palette[(*(src + 1ULL)) * 3ULL + 1ULL], palette[(*src) * 3ULL + 1ULL]);
240 *b_val = _mm_set_epi16(
241 palette[(*(src + 7ULL)) * 3ULL + 2ULL], palette[(*(src + 6ULL)) * 3ULL + 2ULL],
242 palette[(*(src + 5ULL)) * 3ULL + 2ULL], palette[(*(src + 4ULL)) * 3ULL + 2ULL],
243 palette[(*(src + 3ULL)) * 3ULL + 2ULL], palette[(*(src + 2ULL)) * 3ULL + 2ULL],
244 palette[(*(src + 1ULL)) * 3ULL + 2ULL], palette[(*src) * 3ULL + 2ULL]);
245 *a_val = _mm_set1_epi16(0xFF);
/*
 * Convert the source image, eight pixels at a time, into the four planes held
 * in context->priv->PlaneBuffers: [0] luma, [1] Co, [2] Cg and [3] alpha.
 *
 * context  - encoder state; supplies width, height, format, palette,
 *            ColorLossLevel, ChromaSubsamplingLevel and the plane buffers
 * data     - source pixel data
 * scanline - byte distance between consecutive source rows
 *
 * NOTE(review): the return statements are not visible on these lines.
 */
254static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context,
const BYTE* data, UINT32 scanline)
/* Guard against a missing context, missing data or a zero stride. */
258 if (!context || !data || (scanline == 0))
/* rw is the row stride of planes 0..2: the padded width when subsampling. */
261 const UINT16 tempWidth = ROUND_UP_TO(context->width, 8);
262 const UINT16 rw = (context->ChromaSubsamplingLevel > 0 ? tempWidth : context->width);
/* Right-shift count applied to both chroma values below. */
264 const BYTE ccl = WINPR_ASSERTING_INT_CAST(BYTE, context->ColorLossLevel);
266 for (; y < context->height; y++)
/* Output row y is read from source row (height - 1 - y): rows are flipped. */
268 const BYTE* src = data + (context->height - 1 - y) * scanline;
269 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
270 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
271 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
/* The alpha plane uses context->width as its row stride, not rw. */
272 BYTE* aplane = context->priv->PlaneBuffers[3] + y * context->width;
274 for (UINT16 x = 0; x < context->width; x += 8)
276 __m128i r_val = { 0 };
277 __m128i g_val = { 0 };
278 __m128i b_val = { 0 };
279 __m128i a_val = { 0 };
/* NOTE(review): the use of rc is not visible on these lines. */
281 const size_t rc = nsc_encode_next_rgba(context->format, src, context->palette, &r_val,
282 &g_val, &b_val, &a_val);
/* Y = (R >> 2) + (G >> 1) + (B >> 2) */
285 __m128i y_val = _mm_srai_epi16(r_val, 2);
286 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(g_val, 1));
287 y_val = _mm_add_epi16(y_val, _mm_srai_epi16(b_val, 2));
/* Co = (R - B) >> ccl */
288 __m128i co_val = _mm_sub_epi16(r_val, b_val);
289 co_val = _mm_srai_epi16(co_val, ccl);
/* Cg = (G - (R >> 1) - (B >> 1)) >> ccl */
290 __m128i cg_val = _mm_sub_epi16(g_val, _mm_srai_epi16(r_val, 1));
291 cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
292 cg_val = _mm_srai_epi16(cg_val, ccl);
/*
 * Narrow to bytes: luma and alpha with unsigned saturation, chroma with signed
 * saturation. Packing a register with itself repeats the same eight bytes in
 * both halves of the result.
 * NOTE(review): STORE_SI128 is a project macro, presumably a full 128-bit
 * store; the pointer advancement between iterations is not visible - confirm.
 */
293 y_val = _mm_packus_epi16(y_val, y_val);
294 STORE_SI128(yplane, y_val);
295 co_val = _mm_packs_epi16(co_val, co_val);
296 STORE_SI128(coplane, co_val);
297 cg_val = _mm_packs_epi16(cg_val, cg_val);
298 STORE_SI128(cgplane, cg_val);
299 a_val = _mm_packus_epi16(a_val, a_val);
300 STORE_SI128(aplane, a_val);
/*
 * Subsampling with an odd width: repeat the last real column of row y in the
 * column just past it, for planes 0..2.
 */
307 if (context->ChromaSubsamplingLevel > 0 && (context->width % 2) == 1)
309 context->priv->PlaneBuffers[0][y * rw + context->width] =
310 context->priv->PlaneBuffers[0][y * rw + context->width - 1];
311 context->priv->PlaneBuffers[1][y * rw + context->width] =
312 context->priv->PlaneBuffers[1][y * rw + context->width - 1];
313 context->priv->PlaneBuffers[2][y * rw + context->width] =
314 context->priv->PlaneBuffers[2][y * rw + context->width - 1];
/*
 * Subsampling with y odd: copy the preceding row of planes 0..2 into row y.
 * NOTE(review): whether this runs inside or after the row loop cannot be
 * determined from these lines - confirm.
 */
318 if (context->ChromaSubsamplingLevel > 0 && (y % 2) == 1)
320 BYTE* yplane = context->priv->PlaneBuffers[0] + y * rw;
321 BYTE* coplane = context->priv->PlaneBuffers[1] + y * rw;
322 BYTE* cgplane = context->priv->PlaneBuffers[2] + y * rw;
323 CopyMemory(yplane, yplane - rw, rw);
324 CopyMemory(coplane, coplane - rw, rw);
325 CopyMemory(cgplane, cgplane - rw, rw);
/*
 * Halve the resolution of the two chroma planes (PlaneBuffers[1] and [2]) in
 * place by averaging each 2x2 group of samples into one output sample.
 *
 * context - encoder state; supplies width, height and the plane buffers
 *
 * NOTE(review): the source pointers are declared INT8* but the averaging
 * below uses the unsigned intrinsics _mm_avg_epu8 / _mm_avg_epu16, so samples
 * are averaged as unsigned bytes - confirm this is intended.
 */
331static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
335 INT8* co_src0 = NULL;
336 INT8* co_src1 = NULL;
337 INT8* cg_src0 = NULL;
338 INT8* cg_src1 = NULL;
339 UINT32 tempWidth = 0;
340 UINT32 tempHeight = 0;
/* 0x00FF in every 16-bit lane: keeps the low byte of each byte pair. */
343 __m128i mask = _mm_set1_epi16(0xFF);
344 tempWidth = ROUND_UP_TO(context->width, 8);
345 tempHeight = ROUND_UP_TO(context->height, 2);
347 for (
size_t y = 0; y < tempHeight >> 1; y++)
/*
 * Output row y (stride tempWidth / 2) is built from input rows 2y and 2y + 1
 * (stride tempWidth) of the same buffer.
 */
349 co_dst = context->priv->PlaneBuffers[1] + y * (tempWidth >> 1);
350 cg_dst = context->priv->PlaneBuffers[2] + y * (tempWidth >> 1);
351 co_src0 = (INT8*)context->priv->PlaneBuffers[1] + (y << 1) * tempWidth;
352 co_src1 = co_src0 + tempWidth;
353 cg_src0 = (INT8*)context->priv->PlaneBuffers[2] + (y << 1) * tempWidth;
354 cg_src1 = cg_src0 + tempWidth;
/* NOTE(review): pointer advancement per iteration is not visible here. */
356 for (UINT32 x = 0; x < tempWidth >> 1; x += 8)
/*
 * Average the two rows byte-wise, then average each odd byte (shifted down by
 * one byte and masked) with its even neighbour, and narrow back to bytes.
 */
358 t = LOAD_SI128(co_src0);
359 t = _mm_avg_epu8(t, LOAD_SI128(co_src1));
360 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
361 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
362 val = _mm_packus_epi16(val, val);
363 STORE_SI128(co_dst, val);
/* Same reduction for the second chroma plane. */
367 t = LOAD_SI128(cg_src0);
368 t = _mm_avg_epu8(t, LOAD_SI128(cg_src1));
369 val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
370 val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
371 val = _mm_packus_epi16(val, val);
372 STORE_SI128(cg_dst, val);
/*
 * Encoder entry point installed by nsc_init_sse2_int: runs the colour
 * conversion and then, when ChromaSubsamplingLevel is above zero, the chroma
 * subsampling pass.
 *
 * context - encoder state
 * data    - source pixel data
 *
 * NOTE(review): the end of the parameter list (scanline is used below) and
 * the return statements are not visible on these lines.
 */
380static BOOL nsc_encode_sse2(NSC_CONTEXT* WINPR_RESTRICT context,
const BYTE* WINPR_RESTRICT data,
/* Branch taken when the conversion step reports failure. */
383 if (!nsc_encode_argb_to_aycocg_sse2(context, data, scanline))
386 if (context->ChromaSubsamplingLevel > 0)
387 nsc_encode_subsampling_sse2(context);
/*
 * Install the SSE2 encoder on the given context.
 *
 * context - encoder state whose encode callback is set
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the encode profiler is renamed
 * and context->encode is pointed at nsc_encode_sse2.
 * NOTE(review): WINPR_UNUSED(context) presumably belongs to the alternative
 * preprocessor branch, which is not visible on these lines - confirm.
 */
393void nsc_init_sse2_int(NSC_CONTEXT* WINPR_RESTRICT context)
395#if defined(SSE_AVX_INTRINSICS_ENABLED)
396 PROFILER_RENAME(context->priv->prof_nsc_encode,
"nsc_encode_sse2")
397 context->encode = nsc_encode_sse2;
399 WINPR_UNUSED(context);