21#include <winpr/assert.h>
22#include <winpr/cast.h>
23#include <winpr/platform.h>
24#include <freerdp/config.h>
26#include "../rfx_types.h"
29#include "../../core/simd.h"
30#include "../../primitives/sse/prim_avxsse.h"
32#if defined(SSE_AVX_INTRINSICS_ENABLED)
36#include <winpr/sysinfo.h>
/* Makes every __attribute__((...)) annotation expand to nothing.
 * NOTE(review): presumably guarded by a compiler check whose #if/#endif lines
 * are not visible in this view - confirm against the full file. */
42#define __attribute__(...)
/* Byte stride between successive prefetch hints in mm_prefetch_buffer. */
45#define CACHE_LINE_BYTES 64
/* Attribute list applied to the static __inline helpers of this file.
 * Two variants are defined; they differ only in __artificial__.
 * NOTE(review): presumably these live in alternate preprocessor branches that
 * are not visible here; if not, the second line is a macro redefinition - confirm. */
48#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
50#define ATTRIBUTES __gnu_inline__, __always_inline__
/*
 * Issues non-temporal prefetch hints (_MM_HINT_NTA) over a memory region.
 *
 * buffer:    start of the region.
 * num_bytes: length of the region in bytes.
 *
 * NOTE(review): the function's braces are not present in this view of the file.
 */
53static __inline
void __attribute__((ATTRIBUTES))
54mm_prefetch_buffer(
char* WINPR_RESTRICT buffer,
size_t num_bytes)
56 __m128i* buf = (__m128i*)buffer;
/* i indexes __m128i-sized elements; the step is one CACHE_LINE_BYTES worth of
 * such elements, so one hint is issued per CACHE_LINE_BYTES bytes. */
58 for (
size_t i = 0; i < (num_bytes /
sizeof(__m128i)); i += (CACHE_LINE_BYTES /
sizeof(__m128i)))
60 _mm_prefetch((
char*)(&buf[i]), _MM_HINT_NTA);
/*
 * Left-shifts the 16-bit values of a coefficient run by `factor` bits, one
 * __m128i (eight 16-bit lanes) per iteration.
 *
 * buffer:      first coefficient of the run; accessed through LOAD_SI128.
 * buffer_size: number of INT16 elements in the run (the end pointer is
 *              buffer + buffer_size).
 *
 * NOTE(review): the declaration of the third parameter (`factor`), the loop
 * opener, and the statements that store the result and advance `ptr` are not
 * visible in this view. Presumably the shifted vector is written back to `ptr`
 * and `ptr` is incremented each iteration - confirm against the full file.
 */
67static __inline
void __attribute__((ATTRIBUTES))
68rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer,
const size_t buffer_size,
71 __m128i* ptr = (__m128i*)buffer;
72 const __m128i* buf_end = (__m128i*)(buffer + buffer_size);
/* Load eight coefficients and shift each lane left by `factor`; the cast
 * asserts that `factor` fits in an int before it is used as the shift count. */
79 const __m128i la = LOAD_SI128(ptr);
80 const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(
int, factor));
/* The trailing `while` suggests a do/while loop, i.e. the body runs at least once. */
84 }
while (ptr < buf_end);
/*
 * Applies rfx_quantization_decode_block_sse2 to ten consecutive sub-ranges of
 * a 4096-element INT16 buffer, each with its own shift taken from quantVals.
 *
 * buffer:    coefficient buffer; elements [0, 4096) are processed.
 * quantVals: at least ten values are read (indices 0..9); each is used as
 *            quantVals[i] - 1.
 *
 * NOTE(review): the sub-ranges presumably correspond to the RemoteFX wavelet
 * subbands of a tile - confirm against the codec specification.
 * NOTE(review): the subtraction is done on UINT32 values, so an entry of 0
 * would presumably wrap to a very large shift count; no range check on
 * quantVals is visible here - confirm that callers validate the values.
 */
87static void rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
88 const UINT32* WINPR_RESTRICT quantVals)
91 WINPR_ASSERT(quantVals);
/* Hint the whole 4096-coefficient buffer into cache before processing. */
93 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* Three runs of 1024 coefficients. */
94 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1);
95 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1);
96 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1);
/* Three runs of 256 coefficients. */
97 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1);
98 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1);
99 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1);
/* Four runs of 64 coefficients, ending at element 4096. */
100 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1);
101 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1);
102 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1);
103 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1);
/*
 * Computes (value + 2^(factor-1)) >> factor on the 16-bit values of a
 * coefficient run, i.e. an arithmetic right shift with a rounding offset,
 * one __m128i (eight 16-bit lanes) per iteration.
 *
 * buffer:      first coefficient of the run; accessed through LOAD_SI128.
 * buffer_size: number of INT16 elements in the run.
 *
 * NOTE(review): the declaration of the third parameter (`factor`), the loop
 * opener, and the statements that store the result and advance `ptr` are not
 * visible in this view - confirm against the full file.
 * NOTE(review): `1 << (factor - 1)` requires factor >= 1; any guard for
 * factor == 0 is not visible here - confirm one exists.
 */
106static __inline
void __attribute__((ATTRIBUTES))
107rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer,
const unsigned buffer_size,
110 __m128i* ptr = (__m128i*)buffer;
111 const __m128i* buf_end = (
const __m128i*)(buffer + buffer_size);
/* Rounding offset: half of the divisor 2^factor, broadcast to all lanes. */
116 const __m128i half = _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(INT16, 1 << (factor - 1)));
/* Add the offset, then shift right arithmetically (sign-preserving). */
120 const __m128i la = LOAD_SI128(ptr);
121 __m128i a = _mm_add_epi16(la, half);
122 a = _mm_srai_epi16(a, factor);
/* The trailing `while` suggests a do/while loop, i.e. the body runs at least once. */
125 }
while (ptr < buf_end);
/*
 * Applies rfx_quantization_encode_block_sse2 to ten consecutive sub-ranges of
 * a 4096-element INT16 buffer with per-range factors quantization_values[i] - 6,
 * then runs one more pass over the whole buffer with a fixed factor of 5.
 *
 * buffer:              coefficient buffer; elements [0, 4096) are processed.
 * quantization_values: ten values (indices 0..9); each is asserted to lie in
 *                      [6, INT16_MAX + 6] so that value - 6 fits in an INT16.
 *
 * NOTE(review): the sub-ranges presumably correspond to the RemoteFX wavelet
 * subbands of a tile - confirm against the codec specification.
 */
128static void rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
129 const UINT32* WINPR_RESTRICT quantization_values)
131 WINPR_ASSERT(buffer);
132 WINPR_ASSERT(quantization_values);
/* Range-check every factor before any of them is used below. */
133 for (
size_t x = 0; x < 10; x++)
135 WINPR_ASSERT(quantization_values[x] >= 6);
136 WINPR_ASSERT(quantization_values[x] <= INT16_MAX + 6);
/* Hint the whole 4096-coefficient buffer into cache before processing. */
139 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* Three runs of 1024 coefficients. */
140 rfx_quantization_encode_block_sse2(
141 buffer, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[8] - 6));
142 rfx_quantization_encode_block_sse2(
143 buffer + 1024, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[7] - 6));
144 rfx_quantization_encode_block_sse2(
145 buffer + 2048, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[9] - 6));
/* Three runs of 256 coefficients. */
146 rfx_quantization_encode_block_sse2(
147 buffer + 3072, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[5] - 6));
148 rfx_quantization_encode_block_sse2(
149 buffer + 3328, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[4] - 6));
150 rfx_quantization_encode_block_sse2(
151 buffer + 3584, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[6] - 6));
/* Four runs of 64 coefficients, ending at element 4096. */
152 rfx_quantization_encode_block_sse2(
153 buffer + 3840, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[2] - 6));
154 rfx_quantization_encode_block_sse2(
155 buffer + 3904, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[1] - 6));
156 rfx_quantization_encode_block_sse2(
157 buffer + 3968, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[3] - 6));
158 rfx_quantization_encode_block_sse2(
159 buffer + 4032, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[0] - 6));
/* Final uniform pass over all 4096 coefficients with factor 5. */
160 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
/*
 * Horizontal pass of the 2D DWT decode for one subband pair, SSE2 version.
 * For each of subband_width rows it runs two loops over the row, eight
 * coefficients per iteration:
 *   1. l[n] <- l[n] - ((h[n-1] + h[n] + 1) >> 1), stored back through l_ptr;
 *   2. odd[n] = ((l[n] + l[n+1]) >> 1) + (h[n] << 1), written to dst
 *      interleaved with the updated l[n] values.
 *
 * l, h:          input subbands, subband_width coefficients per row.
 * dst:           output; 16 values are written per 8 input coefficients.
 * subband_width: row length and row count; the loops step by 8.
 *
 * NOTE(review): the declarations of l_ptr, h_ptr, first, last, dst1 and dst2,
 * the pointer increments inside the loops, and the condition guarding the
 * lane-0 fix-up are not visible in this view - confirm against the full file.
 */
163static __inline
void __attribute__((ATTRIBUTES))
164rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
165 INT16* WINPR_RESTRICT dst,
size_t subband_width)
169 INT16* dst_ptr = dst;
175 for (
size_t y = 0; y < subband_width; y++)
/* Loop 1: update the l values of this row in place. */
178 for (
size_t n = 0; n < subband_width; n += 8)
181 __m128i l_n = LOAD_SI128(l_ptr);
182 __m128i h_n = LOAD_SI128(h_ptr);
/* h shifted by one position, i.e. h[n-1] for every lane. */
183 __m128i h_n_m = LOAD_SI128(h_ptr - 1);
/* Overwrite lane 0 (h[n-1]) with lane 1 (h[n]).
 * NOTE(review): presumably done only for the first group of a row, where no
 * h[n-1] exists; the guarding condition is not visible here. */
187 first = _mm_extract_epi16(h_n_m, 1);
188 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* tmp = (h[n] + h[n-1] + 1) >> 1 ; result = l[n] - tmp */
191 __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
192 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
193 tmp_n = _mm_srai_epi16(tmp_n, 1);
194 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
195 STORE_SI128(l_ptr, dst_n);
/* Step both pointers back by one row length; presumably they were advanced
 * across the row in loop 1, so loop 2 revisits the same row. */
200 l_ptr -= subband_width;
201 h_ptr -= subband_width;
/* Loop 2: compute the odd outputs and interleave them with the even ones. */
204 for (
size_t n = 0; n < subband_width; n += 8)
207 __m128i h_n = LOAD_SI128(h_ptr);
208 h_n = _mm_slli_epi16(h_n, 1);
/* dst_n holds the l values updated by loop 1; dst_n_p the same shifted by one
 * position (l[n+1]). */
209 __m128i dst_n = LOAD_SI128(l_ptr);
210 __m128i dst_n_p = LOAD_SI128(l_ptr + 1);
/* Last group of the row: replace lane 7 (l[n+1] beyond the row) with lane 6. */
212 if (n == subband_width - 8)
214 last = _mm_extract_epi16(dst_n_p, 6);
215 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
/* tmp = ((l[n+1] + l[n]) >> 1) + 2 * h[n] */
218 __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
219 tmp_n = _mm_srai_epi16(tmp_n, 1);
220 tmp_n = _mm_add_epi16(tmp_n, h_n);
/* Interleave (dst_n, tmp_n) lane pairs: low four pairs, then high four pairs. */
221 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
222 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
223 STORE_SI128(dst_ptr, dst1);
224 STORE_SI128(dst_ptr + 8, dst2);
/*
 * Vertical pass of the 2D DWT decode, SSE2 version. Rows are
 * total_width = 2 * subband_width coefficients wide and are processed eight
 * coefficients at a time. Two sweeps of subband_width rows each are made:
 *   1. dst_row = l_row - ((h_prev + h_row + 1) >> 1)
 *   2. dst_row = ((dst_above + dst_below) >> 1) + (h_row << 1), starting one
 *      row below the first row written by sweep 1.
 *
 * l, h:          inputs read through l_ptr / h_ptr.
 * dst:           output rows.
 * subband_width: number of rows per sweep; half of the row width.
 *
 * NOTE(review): the declarations and increments of l_ptr and h_ptr, the
 * per-iteration advance of dst_ptr, and the conditions choosing between the
 * alternative additions below are not visible in this view - confirm against
 * the full file.
 */
232static __inline
void __attribute__((ATTRIBUTES))
233rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
234 INT16* WINPR_RESTRICT dst,
size_t subband_width)
238 INT16* dst_ptr = dst;
239 const size_t total_width = subband_width + subband_width;
/* Sweep 1. */
242 for (
size_t n = 0; n < subband_width; n++)
244 for (
size_t x = 0; x < total_width; x += 8)
247 const __m128i l_n = LOAD_SI128(l_ptr);
248 const __m128i h_n = LOAD_SI128(h_ptr);
/* tmp starts as h + 1 (the +1 is the rounding term). */
249 __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
/* Add the current h row a second time ...
 * NOTE(review): presumably the n == 0 case, where no previous h row exists. */
252 tmp_n = _mm_add_epi16(tmp_n, h_n);
/* ... or add the h row one total_width earlier.
 * NOTE(review): presumably the else branch of the case above. */
255 const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
256 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
259 tmp_n = _mm_srai_epi16(tmp_n, 1);
260 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
261 STORE_SI128(dst_ptr, dst_n);
/* Extra advance of one row after each inner loop; presumably this leaves every
 * other output row for sweep 2. */
267 dst_ptr += total_width;
/* Sweep 2 starts one row below the top of dst. */
271 dst_ptr = dst + total_width;
274 for (
size_t n = 0; n < subband_width; n++)
276 for (
size_t x = 0; x < total_width; x += 8)
279 __m128i h_n = LOAD_SI128(h_ptr);
/* Row directly above the one being produced. */
280 __m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
281 h_n = _mm_slli_epi16(h_n, 1);
282 __m128i tmp_n = dst_n_m;
/* Last row: there is no row below, so the row above is counted twice. */
284 if (n == subband_width - 1)
285 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
/* Otherwise (presumably an else branch) add the row directly below. */
288 const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
289 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
/* dst = ((above + below) >> 1) + 2 * h */
292 tmp_n = _mm_srai_epi16(tmp_n, 1);
293 const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
294 STORE_SI128(dst_ptr, dst_n);
299 dst_ptr += total_width;
/*
 * Decodes one DWT level: two horizontal passes into the scratch buffer `idwt`,
 * followed by one vertical pass that writes the result back into `buffer`.
 *
 * buffer:        holds the level's subbands; quarters are addressed at offsets
 *                of 1, 2 and 3 times subband_width^2 elements.
 * idwt:          scratch buffer; its second half starts at 2 * subband_width^2.
 * subband_width: width of one subband of this level.
 *
 * NOTE(review): the declarations of `hl` and `l_dst` are not visible in this
 * view - confirm their values against the full file.
 * NOTE(review): the prefetch length is linear in subband_width while the
 * offsets used below are quadratic; as the prefetch is only a hint this does
 * not affect results, but confirm whether the full scratch area was intended.
 */
303static __inline
void __attribute__((ATTRIBUTES))
304rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
305 size_t subband_width)
307 mm_prefetch_buffer((
char*)idwt, 4ULL * subband_width *
sizeof(INT16));
/* Horizontal pass on the (ll, hl) pair, output to l_dst. */
313 INT16* ll = buffer + 3ULL * subband_width * subband_width;
316 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
/* Horizontal pass on the (lh, hh) pair, output to the second half of idwt. */
317 INT16* lh = buffer + 1ULL * subband_width * subband_width;
318 INT16* hh = buffer + 2ULL * subband_width * subband_width;
319 INT16* h_dst = idwt + 2ULL * subband_width * subband_width;
320 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
/* Vertical pass combines both intermediate halves back into buffer. */
322 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
/*
 * Runs the 2D DWT decode on a 4096-coefficient buffer as three successive
 * levels with subband widths 8, 16 and 32, located at offsets 3840, 3072 and 0.
 *
 * buffer:     coefficient buffer, decoded in place level by level.
 * dwt_buffer: scratch buffer passed to every level.
 */
325static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
327 WINPR_ASSERT(buffer);
328 WINPR_ASSERT(dwt_buffer);
/* Hint the whole 4096-coefficient buffer into cache before processing. */
330 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* Smallest level first; each following call uses twice the subband width. */
331 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
332 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
333 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
/*
 * Vertical pass of the 2D DWT encode, SSE2 version. Rows are
 * total_width = 2 * subband_width coefficients wide and are processed eight
 * coefficients at a time. For each output row n it computes
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 * where src[k] denotes input row k.
 *
 * src:           input rows.
 * l, h:          output rows.
 * subband_width: number of output rows per band; half of the row width.
 *
 * NOTE(review): the declaration of h_n_m and its value for the first row, the
 * stores to l and h, and the advances of src, l and h are not visible in this
 * view - confirm against the full file.
 */
336static __inline
void __attribute__((ATTRIBUTES))
337rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
338 INT16* WINPR_RESTRICT h,
size_t subband_width)
340 const size_t total_width = subband_width << 1;
342 for (
size_t n = 0; n < subband_width; n++)
344 for (
size_t x = 0; x < total_width; x += 8)
/* Rows 2n and 2n+1; row 2n+2 defaults to row 2n. */
346 __m128i src_2n = LOAD_SI128(src);
347 __m128i src_2n_1 = LOAD_SI128(src + total_width);
348 __m128i src_2n_2 = src_2n;
/* Only rows before the last one have a real row 2n+2 to load. */
350 if (n < subband_width - 1)
351 src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
/* h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1 */
354 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
355 h_n = _mm_srai_epi16(h_n, 1);
356 h_n = _mm_sub_epi16(src_2n_1, h_n);
357 h_n = _mm_srai_epi16(h_n, 1);
/* Previous h row, one total_width before the current h position. */
362 h_n_m = LOAD_SI128(h - total_width);
/* l[n] = src[2n] + ((h[n-1] + h[n]) >> 1) */
365 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
366 l_n = _mm_srai_epi16(l_n, 1);
367 l_n = _mm_add_epi16(l_n, src_2n);
/*
 * Horizontal pass of the 2D DWT encode, SSE2 version. Each iteration consumes
 * up to 17 consecutive input samples (src[0..16]) and computes eight h and
 * eight l values:
 *   h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1
 *   l[n] = src[2n] + ((h[n-1] + h[n]) >> 1)
 *
 * src:           input samples.
 * l, h:          outputs.
 * subband_width: outputs per row and number of rows; loops step by 8.
 *
 * NOTE(review): the assignment targets of the three _mm_set_epi16 gathers, the
 * condition guarding the lane-0 fix-up, the stores to l and h, and the
 * advances of src, l and h are not visible in this view - confirm against the
 * full file.
 */
378static __inline
void __attribute__((ATTRIBUTES))
379rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
380 INT16* WINPR_RESTRICT h,
size_t subband_width)
382 for (
size_t y = 0; y < subband_width; y++)
384 for (
size_t n = 0; n < subband_width; n += 8)
/* In the last group of a row src[14] is reused in place of src[16]. */
388 const INT16 src16 = (INT16)(((n + 8) == subband_width) ? src[14] : src[16]);
/* Gathers (arguments listed from highest lane to lowest):
 *   even samples src[0], src[2], ..., src[14]  - presumably src_2n
 *   odd samples  src[1], src[3], ..., src[15]  - presumably src_2n_1
 *   even samples src[2], ..., src[14], src16   - presumably src_2n_2 */
390 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
392 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
394 _mm_set_epi16(src16, src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
/* h[n] = (src[2n+1] - ((src[2n] + src[2n+2]) >> 1)) >> 1 */
396 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
397 h_n = _mm_srai_epi16(h_n, 1);
398 h_n = _mm_sub_epi16(src_2n_1, h_n);
399 h_n = _mm_srai_epi16(h_n, 1);
/* h values shifted by one position, i.e. h[n-1] for every lane. */
401 __m128i h_n_m = LOAD_SI128(h - 1);
/* Overwrite lane 0 (h[n-1]) with lane 1.
 * NOTE(review): presumably done only for the first group of a row; the
 * guarding condition is not visible here. */
405 int first = _mm_extract_epi16(h_n_m, 1);
406 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
/* l[n] = src[2n] + ((h[n-1] + h[n]) >> 1) */
410 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
411 l_n = _mm_srai_epi16(l_n, 1);
412 l_n = _mm_add_epi16(l_n, src_2n);
/*
 * Encodes one DWT level: one vertical pass from `buffer` into the scratch
 * buffer `dwt`, followed by two horizontal passes that write the resulting
 * subbands back into `buffer`.
 *
 * buffer:        input samples; on return its quarters at offsets of 1, 2 and
 *                3 times subband_width^2 elements hold lh, hh and ll.
 * dwt:           scratch buffer; its second half starts at 2 * subband_width^2.
 * subband_width: width of one subband of this level.
 *
 * NOTE(review): the declarations of `l_src` and `hl` are not visible in this
 * view - confirm their values against the full file.
 * NOTE(review): the prefetch length is linear in subband_width while the
 * offsets used below are quadratic; as the prefetch is only a hint this does
 * not affect results, but confirm whether the full scratch area was intended.
 */
421static __inline
void __attribute__((ATTRIBUTES))
422rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
423 size_t subband_width)
425 mm_prefetch_buffer((
char*)dwt, 4ULL * subband_width *
sizeof(INT16));
/* Vertical pass: buffer -> (l_src, h_src) inside the scratch buffer. */
428 INT16* h_src = dwt + 2ULL * subband_width * subband_width;
429 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
/* Destination quarters inside buffer for the horizontal passes. */
434 INT16* ll = buffer + 3ULL * subband_width * subband_width;
436 INT16* lh = buffer + 1ULL * subband_width * subband_width;
437 INT16* hh = buffer + 2ULL * subband_width * subband_width;
/* Horizontal passes: l_src -> (ll, hl) and h_src -> (lh, hh). */
438 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
439 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
/*
 * Runs the 2D DWT encode on a 4096-coefficient buffer as three successive
 * levels with subband widths 32, 16 and 8, located at offsets 0, 3072 and 3840.
 *
 * buffer:     sample buffer, transformed in place level by level.
 * dwt_buffer: scratch buffer passed to every level.
 */
442static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
444 WINPR_ASSERT(buffer);
445 WINPR_ASSERT(dwt_buffer);
/* Hint the whole 4096-coefficient buffer into cache before processing. */
447 mm_prefetch_buffer((
char*)buffer, 4096 *
sizeof(INT16));
/* Largest level first; each following call uses half the subband width. */
448 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
449 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
450 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
/*
 * Installs the SSE2 implementations defined in this file on an RFX context.
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the four profilers are renamed
 * to the *_sse2 names and the context's quantization_decode,
 * quantization_encode, dwt_2d_decode and dwt_2d_encode callbacks are pointed
 * at the SSE2 functions.
 *
 * context: RFX context to update; context->priv is dereferenced by the
 *          profiler renames.
 *
 * NOTE(review): the #else / #endif lines are not visible in this view;
 * presumably WINPR_UNUSED(context) is the body of the branch taken when the
 * intrinsics are disabled - confirm against the full file.
 */
454void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context)
456#if defined(SSE_AVX_INTRINSICS_ENABLED)
457 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
"rfx_quantization_decode_sse2")
458 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
459 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
460 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
/* Route the codec's hot paths to the SSE2 versions. */
461 context->quantization_decode = rfx_quantization_decode_sse2;
462 context->quantization_encode = rfx_quantization_encode_sse2;
463 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
464 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
466 WINPR_UNUSED(context);