FreeRDP
Loading...
Searching...
No Matches
rfx_sse2.c
1
21#include <winpr/assert.h>
22#include <winpr/cast.h>
23#include <winpr/platform.h>
24#include <freerdp/config.h>
25
26#include "../rfx_types.h"
27#include "rfx_sse2.h"
28#include "../rfx_quantization.h"
29
30#include "../../core/simd.h"
31#include "../../primitives/sse/prim_avxsse.h"
32
33#if defined(SSE_AVX_INTRINSICS_ENABLED)
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <winpr/sysinfo.h>
38
39#include <xmmintrin.h>
40#include <emmintrin.h>
41
/* MSVC does not understand the GNU attribute syntax: make every use expand to nothing. */
#ifdef _MSC_VER
#define __attribute__(...)
#endif

/* Distance in bytes between two consecutive prefetch requests. */
#define CACHE_LINE_BYTES 64

/* Attribute list attached to the static inline helpers of this file.
 * The clang variant leaves out __artificial__. */
#ifdef __clang__
#define ATTRIBUTES __gnu_inline__, __always_inline__
#else
#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
#endif
53
54static inline void __attribute__((ATTRIBUTES)) mm_prefetch_buffer(char* WINPR_RESTRICT buffer,
55 size_t num_bytes)
56{
57 __m128i* buf = (__m128i*)buffer;
58
59 for (size_t i = 0; i < (num_bytes / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
60 {
61 _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
62 }
63}
64
65/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
66/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
67
68static inline void __attribute__((ATTRIBUTES))
69rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t buffer_size,
70 const UINT32 factor)
71{
72 __m128i* ptr = (__m128i*)buffer;
73 const __m128i* buf_end = (__m128i*)(buffer + buffer_size);
74
75 if (factor == 0)
76 return;
77
78 do
79 {
80 const __m128i la = LOAD_SI128(ptr);
81 const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(int, factor));
82
83 STORE_SI128(ptr, a);
84 ptr++;
85 } while (ptr < buf_end);
86}
87
88WINPR_ATTR_NODISCARD
89static BOOL rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
90 const UINT32* WINPR_RESTRICT quantVals,
91 size_t nrQuantValues)
92{
93 WINPR_ASSERT(buffer);
94 WINPR_ASSERT(quantVals);
95 WINPR_ASSERT(nrQuantValues == NR_QUANT_VALUES);
96
97 for (size_t x = 0; x < nrQuantValues; x++)
98 {
99 const UINT32 val = quantVals[x];
100 if (val < 1)
101 return FALSE;
102 }
103
104 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
105 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
106 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
107 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
108 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
109 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
110 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
111 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
112 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
113 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
114 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
115 return TRUE;
116}
117
118static inline void __attribute__((ATTRIBUTES))
119rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const unsigned buffer_size,
120 const INT16 factor)
121{
122 __m128i* ptr = (__m128i*)buffer;
123 const __m128i* buf_end = (const __m128i*)(buffer + buffer_size);
124
125 if (factor == 0)
126 return;
127
128 const __m128i half = _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(INT16, 1 << (factor - 1)));
129
130 do
131 {
132 const __m128i la = LOAD_SI128(ptr);
133 __m128i a = _mm_add_epi16(la, half);
134 a = _mm_srai_epi16(a, factor);
135 STORE_SI128(ptr, a);
136 ptr++;
137 } while (ptr < buf_end);
138}
139
140WINPR_ATTR_NODISCARD
141static BOOL rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
142 const UINT32* WINPR_RESTRICT quantization_values,
143 size_t quantVals)
144{
145 WINPR_ASSERT(buffer);
146 WINPR_ASSERT(quantization_values);
147 WINPR_ASSERT(quantVals == NR_QUANT_VALUES);
148
149 for (size_t x = 0; x < quantVals; x++)
150 {
151 const UINT32 val = quantization_values[x];
152 if (val < 6)
153 return FALSE;
154 if (val > INT16_MAX)
155 return FALSE;
156 }
157
158 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
159 rfx_quantization_encode_block_sse2(
160 buffer, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[8] - 6)); /* HL1 */
161 rfx_quantization_encode_block_sse2(
162 buffer + 1024, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[7] - 6)); /* LH1 */
163 rfx_quantization_encode_block_sse2(
164 buffer + 2048, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[9] - 6)); /* HH1 */
165 rfx_quantization_encode_block_sse2(
166 buffer + 3072, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[5] - 6)); /* HL2 */
167 rfx_quantization_encode_block_sse2(
168 buffer + 3328, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[4] - 6)); /* LH2 */
169 rfx_quantization_encode_block_sse2(
170 buffer + 3584, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[6] - 6)); /* HH2 */
171 rfx_quantization_encode_block_sse2(
172 buffer + 3840, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[2] - 6)); /* HL3 */
173 rfx_quantization_encode_block_sse2(
174 buffer + 3904, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[1] - 6)); /* LH3 */
175 rfx_quantization_encode_block_sse2(
176 buffer + 3968, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[3] - 6)); /* HH3 */
177 rfx_quantization_encode_block_sse2(
178 buffer + 4032, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[0] - 6)); /* LL3 */
179 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
180 return TRUE;
181}
182
183static inline void __attribute__((ATTRIBUTES))
184rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
185 INT16* WINPR_RESTRICT dst, size_t subband_width)
186{
187 INT16* l_ptr = l;
188 INT16* h_ptr = h;
189 INT16* dst_ptr = dst;
190 int first = 0;
191 int last = 0;
192 __m128i dst1;
193 __m128i dst2;
194
195 for (size_t y = 0; y < subband_width; y++)
196 {
197 /* Even coefficients */
198 for (size_t n = 0; n < subband_width; n += 8)
199 {
200 /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
201 __m128i l_n = LOAD_SI128(l_ptr);
202 __m128i h_n = LOAD_SI128(h_ptr);
203 __m128i h_n_m = LOAD_SI128(h_ptr - 1);
204
205 if (n == 0)
206 {
207 first = _mm_extract_epi16(h_n_m, 1);
208 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
209 }
210
211 __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
212 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
213 tmp_n = _mm_srai_epi16(tmp_n, 1);
214 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
215 STORE_SI128(l_ptr, dst_n);
216 l_ptr += 8;
217 h_ptr += 8;
218 }
219
220 l_ptr -= subband_width;
221 h_ptr -= subband_width;
222
223 /* Odd coefficients */
224 for (size_t n = 0; n < subband_width; n += 8)
225 {
226 /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
227 __m128i h_n = LOAD_SI128(h_ptr);
228 h_n = _mm_slli_epi16(h_n, 1);
229 __m128i dst_n = LOAD_SI128(l_ptr);
230 __m128i dst_n_p = LOAD_SI128(l_ptr + 1);
231
232 if (n == subband_width - 8)
233 {
234 last = _mm_extract_epi16(dst_n_p, 6);
235 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
236 }
237
238 __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
239 tmp_n = _mm_srai_epi16(tmp_n, 1);
240 tmp_n = _mm_add_epi16(tmp_n, h_n);
241 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
242 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
243 STORE_SI128(dst_ptr, dst1);
244 STORE_SI128(dst_ptr + 8, dst2);
245 l_ptr += 8;
246 h_ptr += 8;
247 dst_ptr += 16;
248 }
249 }
250}
251
252static inline void __attribute__((ATTRIBUTES))
253rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
254 INT16* WINPR_RESTRICT dst, size_t subband_width)
255{
256 INT16* l_ptr = l;
257 INT16* h_ptr = h;
258 INT16* dst_ptr = dst;
259 const size_t total_width = subband_width + subband_width;
260
261 /* Even coefficients */
262 for (size_t n = 0; n < subband_width; n++)
263 {
264 for (size_t x = 0; x < total_width; x += 8)
265 {
266 /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
267 const __m128i l_n = LOAD_SI128(l_ptr);
268 const __m128i h_n = LOAD_SI128(h_ptr);
269 __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
270
271 if (n == 0)
272 tmp_n = _mm_add_epi16(tmp_n, h_n);
273 else
274 {
275 const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
276 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
277 }
278
279 tmp_n = _mm_srai_epi16(tmp_n, 1);
280 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
281 STORE_SI128(dst_ptr, dst_n);
282 l_ptr += 8;
283 h_ptr += 8;
284 dst_ptr += 8;
285 }
286
287 dst_ptr += total_width;
288 }
289
290 h_ptr = h;
291 dst_ptr = dst + total_width;
292
293 /* Odd coefficients */
294 for (size_t n = 0; n < subband_width; n++)
295 {
296 for (size_t x = 0; x < total_width; x += 8)
297 {
298 /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
299 __m128i h_n = LOAD_SI128(h_ptr);
300 __m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
301 h_n = _mm_slli_epi16(h_n, 1);
302 __m128i tmp_n = dst_n_m;
303
304 if (n == subband_width - 1)
305 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
306 else
307 {
308 const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
309 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
310 }
311
312 tmp_n = _mm_srai_epi16(tmp_n, 1);
313 const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
314 STORE_SI128(dst_ptr, dst_n);
315 h_ptr += 8;
316 dst_ptr += 8;
317 }
318
319 dst_ptr += total_width;
320 }
321}
322
323static inline void __attribute__((ATTRIBUTES))
324rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
325 size_t subband_width)
326{
327 mm_prefetch_buffer((char*)idwt, 4ULL * subband_width * sizeof(INT16));
328 /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
329 */
330 /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
331 /* The lower part L uses LL(3) and HL(0). */
332 /* The higher part H uses LH(1) and HH(2). */
333 INT16* ll = buffer + 3ULL * subband_width * subband_width;
334 INT16* hl = buffer;
335 INT16* l_dst = idwt;
336 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
337 INT16* lh = buffer + 1ULL * subband_width * subband_width;
338 INT16* hh = buffer + 2ULL * subband_width * subband_width;
339 INT16* h_dst = idwt + 2ULL * subband_width * subband_width;
340 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
341 /* Inverse DWT in vertical direction, results are stored in original buffer. */
342 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
343}
344
345static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
346{
347 WINPR_ASSERT(buffer);
348 WINPR_ASSERT(dwt_buffer);
349
350 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
351 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
352 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
353 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
354}
355
356static inline void __attribute__((ATTRIBUTES))
357rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
358 INT16* WINPR_RESTRICT h, size_t subband_width)
359{
360 const size_t total_width = subband_width << 1;
361
362 for (size_t n = 0; n < subband_width; n++)
363 {
364 for (size_t x = 0; x < total_width; x += 8)
365 {
366 __m128i src_2n = LOAD_SI128(src);
367 __m128i src_2n_1 = LOAD_SI128(src + total_width);
368 __m128i src_2n_2 = src_2n;
369
370 if (n < subband_width - 1)
371 src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
372
373 /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
374 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
375 h_n = _mm_srai_epi16(h_n, 1);
376 h_n = _mm_sub_epi16(src_2n_1, h_n);
377 h_n = _mm_srai_epi16(h_n, 1);
378 STORE_SI128(h, h_n);
379
380 __m128i h_n_m = h_n;
381 if (n != 0)
382 h_n_m = LOAD_SI128(h - total_width);
383
384 /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
385 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
386 l_n = _mm_srai_epi16(l_n, 1);
387 l_n = _mm_add_epi16(l_n, src_2n);
388 STORE_SI128(l, l_n);
389 src += 8;
390 l += 8;
391 h += 8;
392 }
393
394 src += total_width;
395 }
396}
397
398static inline void __attribute__((ATTRIBUTES))
399rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
400 INT16* WINPR_RESTRICT h, size_t subband_width)
401{
402 for (size_t y = 0; y < subband_width; y++)
403 {
404 for (size_t n = 0; n < subband_width; n += 8)
405 {
406 /* The following 3 Set operations consumes more than half of the total DWT processing
407 * time! */
408 const INT16 src16 = (INT16)(((n + 8) == subband_width) ? src[14] : src[16]);
409 __m128i src_2n =
410 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
411 __m128i src_2n_1 =
412 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
413 __m128i src_2n_2 =
414 _mm_set_epi16(src16, src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
415 /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
416 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
417 h_n = _mm_srai_epi16(h_n, 1);
418 h_n = _mm_sub_epi16(src_2n_1, h_n);
419 h_n = _mm_srai_epi16(h_n, 1);
420 STORE_SI128(h, h_n);
421 __m128i h_n_m = LOAD_SI128(h - 1);
422
423 if (n == 0)
424 {
425 int first = _mm_extract_epi16(h_n_m, 1);
426 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
427 }
428
429 /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
430 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
431 l_n = _mm_srai_epi16(l_n, 1);
432 l_n = _mm_add_epi16(l_n, src_2n);
433 STORE_SI128(l, l_n);
434 src += 16;
435 l += 8;
436 h += 8;
437 }
438 }
439}
440
441static inline void __attribute__((ATTRIBUTES))
442rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
443 size_t subband_width)
444{
445 mm_prefetch_buffer((char*)dwt, 4ULL * subband_width * sizeof(INT16));
446 /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
447 INT16* l_src = dwt;
448 INT16* h_src = dwt + 2ULL * subband_width * subband_width;
449 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
450 /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order,
451 * stored in original buffer. */
452 /* The lower part L generates LL(3) and HL(0). */
453 /* The higher part H generates LH(1) and HH(2). */
454 INT16* ll = buffer + 3ULL * subband_width * subband_width;
455 INT16* hl = buffer;
456 INT16* lh = buffer + 1ULL * subband_width * subband_width;
457 INT16* hh = buffer + 2ULL * subband_width * subband_width;
458 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
459 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
460}
461
462static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
463{
464 WINPR_ASSERT(buffer);
465 WINPR_ASSERT(dwt_buffer);
466
467 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
468 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
469 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
470 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
471}
472#endif
473
474void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context)
475{
476#if defined(SSE_AVX_INTRINSICS_ENABLED)
477 WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
478 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2")
479 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
480 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
481 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
482 context->quantization_decode = rfx_quantization_decode_sse2;
483 context->quantization_encode = rfx_quantization_encode_sse2;
484 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
485 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
486#else
487 WINPR_UNUSED(context);
488 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
489#endif
490}