21#include <winpr/assert.h>
22#include <winpr/cast.h>
23#include <winpr/platform.h>
24#include <freerdp/config.h>
26#include "../rfx_types.h"
28#include "../rfx_quantization.h"
30#include "../../core/simd.h"
31#include "../../primitives/sse/prim_avxsse.h"
33#if defined(SSE_AVX_INTRINSICS_ENABLED)
37#include <winpr/sysinfo.h>
/*
 * Defining __attribute__ as an empty variadic macro makes every
 * `__attribute__((ATTRIBUTES))` in this file expand to nothing.
 * NOTE(review): presumably wrapped in a compiler-specific guard - confirm.
 */
43#define __attribute__(...)
/* Stride, in bytes, between two prefetch hints issued by mm_prefetch_buffer. */
46#define CACHE_LINE_BYTES 64
/*
 * Attribute list applied to the inline helpers below. Two alternative
 * definitions exist; NOTE(review): presumably selected by a compiler check,
 * since only one can be active at a time - confirm.
 */
49#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
51#define ATTRIBUTES __gnu_inline__, __always_inline__
/*
 * Walk `buffer` and issue one non-temporal prefetch hint (_MM_HINT_NTA) per
 * CACHE_LINE_BYTES bytes.
 *
 * buffer     start of the memory region to prefetch.
 * num_bytes  (used in the loop bound) size of the region in bytes.
 *            NOTE(review): its declaration is not shown here - confirm type.
 */
54static inline void __attribute__((ATTRIBUTES)) mm_prefetch_buffer(
char* WINPR_RESTRICT buffer,
/* View the region as 16-byte vectors so the index below counts __m128i units. */
57 __m128i* buf = (__m128i*)buffer;
/* The step equals the number of __m128i per cache line: one hint per line. */
59 for (
size_t i = 0; i < (num_bytes /
sizeof(__m128i)); i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
61 _mm_prefetch((
char*)(&buf[i]), _MM_HINT_NTA);
/*
 * De-quantize one run of coefficients: every 16-bit lane is shifted left by
 * `factor` bits, eight lanes per 128-bit vector.
 *
 * buffer       first coefficient of the run.
 * buffer_size  number of INT16 coefficients in the run (the end pointer is
 *              computed with INT16 pointer arithmetic).
 * factor       left-shift amount. NOTE(review): its declaration is not shown
 *              here - confirm type.
 */
68static inline void __attribute__((ATTRIBUTES))
69rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer,
const size_t buffer_size,
72 __m128i* ptr = (__m128i*)buffer;
/* One-past-the-end of the run, expressed as a vector pointer. */
73 const __m128i* buf_end = (__m128i*)(buffer + buffer_size);
/* Load eight coefficients and multiply each by 2^factor. */
80 const __m128i la = LOAD_SI128(ptr);
81 const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(
int, factor));
/*
 * NOTE(review): the trailing `while` suggests a do/while loop, i.e. the body
 * runs at least once; the store of `a` and the advance of `ptr` are expected
 * inside the loop body - confirm.
 */
85 }
while (ptr < buf_end);
/*
 * De-quantize a full 4096-coefficient tile in place.
 *
 * The tile is processed as ten consecutive runs (3 x 1024, 3 x 256, 4 x 64
 * coefficients); each run is shifted left by `quantVals[i] - 1`, where the
 * index `i` used for each run is given in the calls below.
 *
 * buffer         tile coefficients (4096 INT16 values are touched).
 * quantVals      per-run quantization values; must not be NULL.
 * nrQuantValues  number of entries in quantVals; asserted to equal
 *                NR_QUANT_VALUES.
 *
 * NOTE(review): the checks applied to each quantization value and the BOOL
 * values returned are not shown here - confirm.
 */
89static BOOL rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
90 const UINT32* WINPR_RESTRICT quantVals,
94 WINPR_ASSERT(quantVals);
95 WINPR_ASSERT(nrQuantValues == NR_QUANT_VALUES);
/* Inspect every quantization value before any coefficient is modified. */
97 for (
size_t x = 0; x < nrQuantValues; x++)
99 const UINT32 val = quantVals[x];
/* Hint the whole tile (4096 coefficients) into cache before processing. */
104 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/*
 * Three runs of 1024 coefficients.
 * NOTE(review): presumably the first-level HL/LH/HH sub-bands - confirm.
 */
105 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1);
106 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1);
107 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1);
/* Three runs of 256 coefficients (presumably second level - confirm). */
108 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1);
109 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1);
110 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1);
/* Four runs of 64 coefficients (presumably third level plus LL - confirm). */
111 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1);
112 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1);
113 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1);
114 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1);
/*
 * Quantize one run of coefficients: each 16-bit lane becomes
 * (value + 2^(factor-1)) >> factor, i.e. a division by 2^factor that rounds
 * by adding half of the divisor before an arithmetic right shift.
 *
 * buffer       first coefficient of the run.
 * buffer_size  number of INT16 coefficients in the run.
 * factor       right-shift amount. NOTE(review): its declaration is not shown
 *              here - confirm type.
 */
118static inline void __attribute__((ATTRIBUTES))
119rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer,
const unsigned buffer_size,
122 __m128i* ptr = (__m128i*)buffer;
123 const __m128i* buf_end = (
const __m128i*)(buffer + buffer_size);
/*
 * Rounding bias, broadcast to all eight lanes.
 * NOTE(review): factor == 0 would shift by -1 here; an early-out for that
 * case is expected before this line - confirm it exists.
 */
128 const __m128i half = _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(INT16, 1 << (factor - 1)));
/* Load eight coefficients, add the bias, then shift right (sign-preserving). */
132 const __m128i la = LOAD_SI128(ptr);
133 __m128i a = _mm_add_epi16(la, half);
134 a = _mm_srai_epi16(a, factor);
/*
 * NOTE(review): the trailing `while` suggests a do/while loop; the store of
 * `a` and the advance of `ptr` are expected inside the loop body - confirm.
 */
137 }
while (ptr < buf_end);
/*
 * Quantize a full 4096-coefficient tile in place.
 *
 * The tile is processed as ten consecutive runs (3 x 1024, 3 x 256, 4 x 64
 * coefficients), each shifted right with rounding by
 * `quantization_values[i] - 6`. Afterwards the entire tile is shifted right
 * with rounding by a further 5 bits.
 *
 * buffer               tile coefficients; must not be NULL.
 * quantization_values  per-run quantization values; must not be NULL.
 * quantVals            number of entries in quantization_values; asserted to
 *                      equal NR_QUANT_VALUES.
 *
 * NOTE(review): the checks applied to each quantization value and the BOOL
 * values returned are not shown here - confirm.
 */
141static BOOL rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
142 const UINT32* WINPR_RESTRICT quantization_values,
145 WINPR_ASSERT(buffer);
146 WINPR_ASSERT(quantization_values);
147 WINPR_ASSERT(quantVals == NR_QUANT_VALUES);
/* Inspect every quantization value before any coefficient is modified. */
149 for (
size_t x = 0; x < quantVals; x++)
151 const UINT32 val = quantization_values[x];
/* Hint the whole tile (4096 coefficients) into cache before processing. */
158 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* Three runs of 1024 coefficients. */
159 rfx_quantization_encode_block_sse2(
160 buffer, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[8] - 6));
161 rfx_quantization_encode_block_sse2(
162 buffer + 1024, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[7] - 6));
163 rfx_quantization_encode_block_sse2(
164 buffer + 2048, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[9] - 6));
/* Three runs of 256 coefficients. */
165 rfx_quantization_encode_block_sse2(
166 buffer + 3072, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[5] - 6));
167 rfx_quantization_encode_block_sse2(
168 buffer + 3328, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[4] - 6));
169 rfx_quantization_encode_block_sse2(
170 buffer + 3584, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[6] - 6));
/* Four runs of 64 coefficients. */
171 rfx_quantization_encode_block_sse2(
172 buffer + 3840, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[2] - 6));
173 rfx_quantization_encode_block_sse2(
174 buffer + 3904, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[1] - 6));
175 rfx_quantization_encode_block_sse2(
176 buffer + 3968, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[3] - 6));
177 rfx_quantization_encode_block_sse2(
178 buffer + 4032, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[0] - 6));
/* Final pass over the whole tile: an additional rounded shift by 5 bits. */
179 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
/*
 * Horizontal inverse-DWT step: combines a low band `l` and a high band `h`
 * into rows of interleaved even/odd samples written to `dst`.
 *
 * For each of the subband_width rows:
 *   even:  l[n]       = l[n] - ((h[n-1] + h[n] + 1) >> 1)   (stored back in l)
 *   odd:   dst[2n+1]  = ((l[n] + l[n+1]) >> 1) + (h[n] << 1)
 *          dst[2n]    = l[n]
 *
 * NOTE(review): the declarations of l_ptr, h_ptr, first, last, dst1 and dst2
 * and the per-iteration pointer increments are not shown here - confirm.
 */
183static inline void __attribute__((ATTRIBUTES))
184rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
185 INT16* WINPR_RESTRICT dst,
size_t subband_width)
189 INT16* dst_ptr = dst;
195 for (
size_t y = 0; y < subband_width; y++)
/* Pass 1: even samples, eight coefficients per iteration. */
198 for (
size_t n = 0; n < subband_width; n += 8)
201 __m128i l_n = LOAD_SI128(l_ptr);
202 __m128i h_n = LOAD_SI128(h_ptr);
/*
 * h shifted by one element, so lane k holds h[n+k-1].
 * NOTE(review): for the first chunk this load starts one element before
 * h_ptr - confirm that address is always readable.
 */
203 __m128i h_n_m = LOAD_SI128(h_ptr - 1);
/*
 * Copy lane 1 into lane 0, replacing h[n-1] with h[n].
 * NOTE(review): presumably done only for the first chunk of a row - confirm.
 */
207 first = _mm_extract_epi16(h_n_m, 1);
208 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* (h[n-1] + h[n] + 1) >> 1, then subtract from l[n]; result stored into l. */
211 __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
212 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
213 tmp_n = _mm_srai_epi16(tmp_n, 1);
214 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
215 STORE_SI128(l_ptr, dst_n);
/* Step both band pointers back by one row width before pass 2. */
220 l_ptr -= subband_width;
221 h_ptr -= subband_width;
/* Pass 2: odd samples, and interleaving of even/odd into dst. */
224 for (
size_t n = 0; n < subband_width; n += 8)
227 __m128i h_n = LOAD_SI128(h_ptr);
228 h_n = _mm_slli_epi16(h_n, 1);
/* Even samples computed in pass 1, and the same shifted by one element. */
229 __m128i dst_n = LOAD_SI128(l_ptr);
230 __m128i dst_n_p = LOAD_SI128(l_ptr + 1);
/* Last chunk of the row: lane 7 would be past the row, so reuse lane 6. */
232 if (n == subband_width - 8)
234 last = _mm_extract_epi16(dst_n_p, 6);
235 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
/* ((even[n] + even[n+1]) >> 1) + (h[n] << 1) */
238 __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
239 tmp_n = _mm_srai_epi16(tmp_n, 1);
240 tmp_n = _mm_add_epi16(tmp_n, h_n);
/* Interleave even (dst_n) and odd (tmp_n) lanes: 8 inputs -> 16 outputs. */
241 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
242 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
243 STORE_SI128(dst_ptr, dst1);
244 STORE_SI128(dst_ptr + 8, dst2);
/*
 * Vertical inverse-DWT step: combines band `l` and band `h` (each row is
 * total_width = 2 * subband_width samples wide) into `dst`.
 *
 * Pass 1 writes every second row of dst, starting at row 0:
 *   dst[2n]   = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 * Pass 2 writes the rows in between, starting at row 1:
 *   dst[2n+1] = ((dst[2n] + dst[2n+2]) >> 1) + (h[n] << 1)
 *
 * NOTE(review): the declarations of l_ptr/h_ptr, their increments, and the
 * branch conditions around the boundary rows are not all shown here - confirm.
 */
252static inline void __attribute__((ATTRIBUTES))
253rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
254 INT16* WINPR_RESTRICT dst,
size_t subband_width)
258 INT16* dst_ptr = dst;
259 const size_t total_width = subband_width + subband_width;
/* Pass 1: one iteration of n per band row, eight columns per inner step. */
262 for (
size_t n = 0; n < subband_width; n++)
264 for (
size_t x = 0; x < total_width; x += 8)
267 const __m128i l_n = LOAD_SI128(l_ptr);
268 const __m128i h_n = LOAD_SI128(h_ptr);
/* Start with h[n] + 1 (the rounding term). */
269 __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
/*
 * Add the neighbouring high-band row: either h[n] again, or the row one
 * total_width earlier.
 * NOTE(review): presumably h[n] is reused only for the first row, where no
 * previous row exists - confirm.
 */
272 tmp_n = _mm_add_epi16(tmp_n, h_n);
275 const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
276 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
279 tmp_n = _mm_srai_epi16(tmp_n, 1);
280 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
281 STORE_SI128(dst_ptr, dst_n);
/* Skip one destination row; it is filled in by pass 2. */
287 dst_ptr += total_width;
/* Pass 2 starts at the second destination row. */
291 dst_ptr = dst + total_width;
294 for (
size_t n = 0; n < subband_width; n++)
296 for (
size_t x = 0; x < total_width; x += 8)
299 __m128i h_n = LOAD_SI128(h_ptr);
/* Row directly above the one being written (produced by pass 1). */
300 __m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
301 h_n = _mm_slli_epi16(h_n, 1);
302 __m128i tmp_n = dst_n_m;
/* Last row: there is no row below, so the row above is counted twice. */
304 if (n == subband_width - 1)
305 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
/* NOTE(review): presumably the else-branch: add the row below - confirm. */
308 const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
309 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
312 tmp_n = _mm_srai_epi16(tmp_n, 1);
313 const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
314 STORE_SI128(dst_ptr, dst_n);
/* Skip the row that pass 1 already wrote. */
319 dst_ptr += total_width;
/*
 * Inverse 2D DWT for one decomposition level.
 *
 * `buffer` holds four consecutive sub-bands of subband_width^2 coefficients
 * each; `idwt` is scratch space for the intermediate result. Two horizontal
 * passes write into idwt, then one vertical pass writes the reconstructed
 * block back into `buffer`.
 *
 * NOTE(review): the definitions of `hl` and `l_dst` are not shown here -
 * presumably `buffer` and `idwt` respectively; confirm.
 */
323static inline void __attribute__((ATTRIBUTES))
324rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
325 size_t subband_width)
/* Prefetch hint covering the first 4 * subband_width INT16 values of idwt. */
327 mm_prefetch_buffer((
char*)idwt, 4ULL * subband_width *
sizeof(INT16));
/* LL is the fourth sub-band in buffer. */
333 INT16* ll = buffer + 3ULL * subband_width * subband_width;
/* Horizontal pass 1: LL + HL -> l_dst. */
336 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
/* LH and HH are the second and third sub-bands in buffer. */
337 INT16* lh = buffer + 1ULL * subband_width * subband_width;
338 INT16* hh = buffer + 2ULL * subband_width * subband_width;
/* Horizontal pass 2: LH + HH -> second half of idwt. */
339 INT16* h_dst = idwt + 2ULL * subband_width * subband_width;
340 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
/* Vertical pass: merge both intermediate halves back into buffer. */
342 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
/*
 * Inverse 2D DWT of a full 4096-coefficient tile, performed in place.
 *
 * Three levels are reconstructed in order of increasing sub-band width
 * (8, 16, 32), starting at buffer offsets 3840, 3072 and 0. Each level
 * writes its result back into buffer.
 *
 * buffer      tile coefficients; must not be NULL.
 * dwt_buffer  scratch buffer shared by all three levels; must not be NULL.
 */
345static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
347 WINPR_ASSERT(buffer);
348 WINPR_ASSERT(dwt_buffer);
/* Hint the whole tile into cache before the three passes. */
350 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
351 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
352 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
353 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
/*
 * Vertical forward-DWT step: splits the rows of `src` (each total_width =
 * 2 * subband_width samples wide) into a low band `l` and a high band `h`.
 *
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 *
 * NOTE(review): the declaration of h_n_m, the stores to l/h and the pointer
 * increments are not shown here - confirm.
 */
356static inline void __attribute__((ATTRIBUTES))
357rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
358 INT16* WINPR_RESTRICT h,
size_t subband_width)
360 const size_t total_width = subband_width << 1;
/* One iteration of n per output row, eight columns per inner step. */
362 for (
size_t n = 0; n < subband_width; n++)
364 for (
size_t x = 0; x < total_width; x += 8)
/* Rows 2n and 2n+1 of the source. */
366 __m128i src_2n = LOAD_SI128(src);
367 __m128i src_2n_1 = LOAD_SI128(src + total_width);
/* Row 2n+2; for the last row pair it does not exist, so row 2n is reused. */
368 __m128i src_2n_2 = src_2n;
370 if (n < subband_width - 1)
371 src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
/* High band: odd row minus the average of its two even neighbours, halved. */
374 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
375 h_n = _mm_srai_epi16(h_n, 1);
376 h_n = _mm_sub_epi16(src_2n_1, h_n);
377 h_n = _mm_srai_epi16(h_n, 1);
/*
 * Previous high-band row.
 * NOTE(review): presumably loaded only when a previous row exists - confirm.
 */
382 h_n_m = LOAD_SI128(h - total_width);
/* Low band: even row plus the average of the two adjacent high-band rows. */
385 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
386 l_n = _mm_srai_epi16(l_n, 1);
387 l_n = _mm_add_epi16(l_n, src_2n);
/*
 * Horizontal forward-DWT step: splits each row of `src` into a low band `l`
 * and a high band `h`, eight output coefficients (16 input samples) at a time.
 *
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 *
 * NOTE(review): the declarations of src_2n, src_2n_1 and src_2n_2, the
 * stores to l/h and the pointer increments are not shown here - confirm.
 */
398static inline void __attribute__((ATTRIBUTES))
399rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
400 INT16* WINPR_RESTRICT h,
size_t subband_width)
402 for (
size_t y = 0; y < subband_width; y++)
404 for (
size_t n = 0; n < subband_width; n += 8)
/* Sample following this chunk; in the last chunk of a row src[14] is reused. */
408 const INT16 src16 = (INT16)(((n + 8) == subband_width) ? src[14] : src[16]);
/*
 * Gather strided samples (arguments of _mm_set_epi16 run from the highest
 * lane down to lane 0): even samples, odd samples, and even samples shifted
 * by one position.
 * NOTE(review): presumably assigned to src_2n, src_2n_1 and src_2n_2 in
 * that order - confirm.
 */
410 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
412 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
414 _mm_set_epi16(src16, src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
/* High band: odd sample minus the average of its even neighbours, halved. */
416 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
417 h_n = _mm_srai_epi16(h_n, 1);
418 h_n = _mm_sub_epi16(src_2n_1, h_n);
419 h_n = _mm_srai_epi16(h_n, 1);
/* High band shifted by one element, so lane k holds h[n+k-1]. */
421 __m128i h_n_m = LOAD_SI128(h - 1);
/*
 * Copy lane 1 into lane 0, replacing h[n-1] with h[n].
 * NOTE(review): presumably done only for the first chunk of a row - confirm.
 */
425 int first = _mm_extract_epi16(h_n_m, 1);
426 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* Low band: even sample plus the average of the two adjacent high values. */
430 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
431 l_n = _mm_srai_epi16(l_n, 1);
432 l_n = _mm_add_epi16(l_n, src_2n);
/*
 * Forward 2D DWT for one decomposition level.
 *
 * A vertical pass splits `buffer` into two intermediate halves stored in the
 * scratch buffer `dwt`; two horizontal passes then split those halves into
 * four sub-bands of subband_width^2 coefficients each, written back into
 * `buffer`.
 *
 * NOTE(review): the definitions of `l_src` and `hl` are not shown here -
 * presumably `dwt` and `buffer` respectively; confirm.
 */
441static inline void __attribute__((ATTRIBUTES))
442rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
443 size_t subband_width)
/* Prefetch hint covering the first 4 * subband_width INT16 values of dwt. */
445 mm_prefetch_buffer((
char*)dwt, 4ULL * subband_width *
sizeof(INT16));
/* Vertical pass: buffer -> l_src / h_src (h_src is the second half of dwt). */
448 INT16* h_src = dwt + 2ULL * subband_width * subband_width;
449 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
/* Destination sub-bands inside buffer: LH second, HH third, LL fourth. */
454 INT16* ll = buffer + 3ULL * subband_width * subband_width;
456 INT16* lh = buffer + 1ULL * subband_width * subband_width;
457 INT16* hh = buffer + 2ULL * subband_width * subband_width;
/* Horizontal passes: l_src -> LL + HL, h_src -> LH + HH. */
458 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
459 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
/*
 * Forward 2D DWT of a full 4096-coefficient tile, performed in place.
 *
 * Three levels are computed in order of decreasing sub-band width
 * (32, 16, 8), starting at buffer offsets 0, 3072 and 3840. Each level
 * writes its sub-bands back into buffer.
 *
 * buffer      tile samples; must not be NULL.
 * dwt_buffer  scratch buffer shared by all three levels; must not be NULL.
 */
462static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
464 WINPR_ASSERT(buffer);
465 WINPR_ASSERT(dwt_buffer);
/* Hint the whole tile into cache before the three passes. */
467 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
468 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
469 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
470 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
/*
 * Install the SSE2 implementations of quantization and DWT into `context`.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the four codec callbacks of
 * the context are replaced with the *_sse2 functions from this file and the
 * matching profilers are renamed. Otherwise the context is left untouched
 * and only a verbose log message is emitted.
 */
474void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context)
476#if defined(SSE_AVX_INTRINSICS_ENABLED)
477 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
/* Relabel the profilers so their output names the SSE2 variants. */
478 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
"rfx_quantization_decode_sse2")
479 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
480 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
481 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
/* Route the codec callbacks to the SSE2 implementations. */
482 context->quantization_decode = rfx_quantization_decode_sse2;
483 context->quantization_encode = rfx_quantization_encode_sse2;
484 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
485 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
/* Fallback when the intrinsics are not enabled: leave the context unchanged. */
487 WINPR_UNUSED(context);
488 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE2 intrinsics not available");