FreeRDP
Loading...
Searching...
No Matches
rfx_sse2.c
1
21#include <winpr/assert.h>
22#include <winpr/cast.h>
23#include <winpr/platform.h>
24#include <freerdp/config.h>
25
26#include "../rfx_types.h"
27#include "rfx_sse2.h"
28
29#include "../../core/simd.h"
30#include "../../primitives/sse/prim_avxsse.h"
31
32#if defined(SSE_AVX_INTRINSICS_ENABLED)
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <winpr/sysinfo.h>
37
38#include <xmmintrin.h>
39#include <emmintrin.h>
40
41#ifdef _MSC_VER
42#define __attribute__(...)
43#endif
44
45#define CACHE_LINE_BYTES 64
46
47#ifndef __clang__
48#define ATTRIBUTES __gnu_inline__, __always_inline__, __artificial__
49#else
50#define ATTRIBUTES __gnu_inline__, __always_inline__
51#endif
52
53static __inline void __attribute__((ATTRIBUTES))
54mm_prefetch_buffer(char* WINPR_RESTRICT buffer, size_t num_bytes)
55{
56 __m128i* buf = (__m128i*)buffer;
57
58 for (size_t i = 0; i < (num_bytes / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
59 {
60 _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
61 }
62}
63
64/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
65/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
66
67static __inline void __attribute__((ATTRIBUTES))
68rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t buffer_size,
69 const UINT32 factor)
70{
71 __m128i* ptr = (__m128i*)buffer;
72 const __m128i* buf_end = (__m128i*)(buffer + buffer_size);
73
74 if (factor == 0)
75 return;
76
77 do
78 {
79 const __m128i la = LOAD_SI128(ptr);
80 const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(int, factor));
81
82 STORE_SI128(ptr, a);
83 ptr++;
84 } while (ptr < buf_end);
85}
86
87static void rfx_quantization_decode_sse2(INT16* WINPR_RESTRICT buffer,
88 const UINT32* WINPR_RESTRICT quantVals)
89{
90 WINPR_ASSERT(buffer);
91 WINPR_ASSERT(quantVals);
92
93 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
94 rfx_quantization_decode_block_sse2(&buffer[0], 1024, quantVals[8] - 1); /* HL1 */
95 rfx_quantization_decode_block_sse2(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
96 rfx_quantization_decode_block_sse2(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
97 rfx_quantization_decode_block_sse2(&buffer[3072], 256, quantVals[5] - 1); /* HL2 */
98 rfx_quantization_decode_block_sse2(&buffer[3328], 256, quantVals[4] - 1); /* LH2 */
99 rfx_quantization_decode_block_sse2(&buffer[3584], 256, quantVals[6] - 1); /* HH2 */
100 rfx_quantization_decode_block_sse2(&buffer[3840], 64, quantVals[2] - 1); /* HL3 */
101 rfx_quantization_decode_block_sse2(&buffer[3904], 64, quantVals[1] - 1); /* LH3 */
102 rfx_quantization_decode_block_sse2(&buffer[3968], 64, quantVals[3] - 1); /* HH3 */
103 rfx_quantization_decode_block_sse2(&buffer[4032], 64, quantVals[0] - 1); /* LL3 */
104}
105
106static __inline void __attribute__((ATTRIBUTES))
107rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const unsigned buffer_size,
108 const INT16 factor)
109{
110 __m128i* ptr = (__m128i*)buffer;
111 const __m128i* buf_end = (const __m128i*)(buffer + buffer_size);
112
113 if (factor == 0)
114 return;
115
116 const __m128i half = _mm_set1_epi16(WINPR_ASSERTING_INT_CAST(INT16, 1 << (factor - 1)));
117
118 do
119 {
120 const __m128i la = LOAD_SI128(ptr);
121 __m128i a = _mm_add_epi16(la, half);
122 a = _mm_srai_epi16(a, factor);
123 STORE_SI128(ptr, a);
124 ptr++;
125 } while (ptr < buf_end);
126}
127
128static void rfx_quantization_encode_sse2(INT16* WINPR_RESTRICT buffer,
129 const UINT32* WINPR_RESTRICT quantization_values)
130{
131 WINPR_ASSERT(buffer);
132 WINPR_ASSERT(quantization_values);
133 for (size_t x = 0; x < 10; x++)
134 {
135 WINPR_ASSERT(quantization_values[x] >= 6);
136 WINPR_ASSERT(quantization_values[x] <= INT16_MAX + 6);
137 }
138
139 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
140 rfx_quantization_encode_block_sse2(
141 buffer, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[8] - 6)); /* HL1 */
142 rfx_quantization_encode_block_sse2(
143 buffer + 1024, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[7] - 6)); /* LH1 */
144 rfx_quantization_encode_block_sse2(
145 buffer + 2048, 1024, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[9] - 6)); /* HH1 */
146 rfx_quantization_encode_block_sse2(
147 buffer + 3072, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[5] - 6)); /* HL2 */
148 rfx_quantization_encode_block_sse2(
149 buffer + 3328, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[4] - 6)); /* LH2 */
150 rfx_quantization_encode_block_sse2(
151 buffer + 3584, 256, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[6] - 6)); /* HH2 */
152 rfx_quantization_encode_block_sse2(
153 buffer + 3840, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[2] - 6)); /* HL3 */
154 rfx_quantization_encode_block_sse2(
155 buffer + 3904, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[1] - 6)); /* LH3 */
156 rfx_quantization_encode_block_sse2(
157 buffer + 3968, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[3] - 6)); /* HH3 */
158 rfx_quantization_encode_block_sse2(
159 buffer + 4032, 64, WINPR_ASSERTING_INT_CAST(INT16, quantization_values[0] - 6)); /* LL3 */
160 rfx_quantization_encode_block_sse2(buffer, 4096, 5);
161}
162
163static __inline void __attribute__((ATTRIBUTES))
164rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
165 INT16* WINPR_RESTRICT dst, size_t subband_width)
166{
167 INT16* l_ptr = l;
168 INT16* h_ptr = h;
169 INT16* dst_ptr = dst;
170 int first = 0;
171 int last = 0;
172 __m128i dst1;
173 __m128i dst2;
174
175 for (size_t y = 0; y < subband_width; y++)
176 {
177 /* Even coefficients */
178 for (size_t n = 0; n < subband_width; n += 8)
179 {
180 /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
181 __m128i l_n = LOAD_SI128(l_ptr);
182 __m128i h_n = LOAD_SI128(h_ptr);
183 __m128i h_n_m = LOAD_SI128(h_ptr - 1);
184
185 if (n == 0)
186 {
187 first = _mm_extract_epi16(h_n_m, 1);
188 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
189 }
190
191 __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
192 tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
193 tmp_n = _mm_srai_epi16(tmp_n, 1);
194 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
195 STORE_SI128(l_ptr, dst_n);
196 l_ptr += 8;
197 h_ptr += 8;
198 }
199
200 l_ptr -= subband_width;
201 h_ptr -= subband_width;
202
203 /* Odd coefficients */
204 for (size_t n = 0; n < subband_width; n += 8)
205 {
206 /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
207 __m128i h_n = LOAD_SI128(h_ptr);
208 h_n = _mm_slli_epi16(h_n, 1);
209 __m128i dst_n = LOAD_SI128(l_ptr);
210 __m128i dst_n_p = LOAD_SI128(l_ptr + 1);
211
212 if (n == subband_width - 8)
213 {
214 last = _mm_extract_epi16(dst_n_p, 6);
215 dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
216 }
217
218 __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
219 tmp_n = _mm_srai_epi16(tmp_n, 1);
220 tmp_n = _mm_add_epi16(tmp_n, h_n);
221 dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
222 dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
223 STORE_SI128(dst_ptr, dst1);
224 STORE_SI128(dst_ptr + 8, dst2);
225 l_ptr += 8;
226 h_ptr += 8;
227 dst_ptr += 16;
228 }
229 }
230}
231
232static __inline void __attribute__((ATTRIBUTES))
233rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
234 INT16* WINPR_RESTRICT dst, size_t subband_width)
235{
236 INT16* l_ptr = l;
237 INT16* h_ptr = h;
238 INT16* dst_ptr = dst;
239 const size_t total_width = subband_width + subband_width;
240
241 /* Even coefficients */
242 for (size_t n = 0; n < subband_width; n++)
243 {
244 for (size_t x = 0; x < total_width; x += 8)
245 {
246 /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
247 const __m128i l_n = LOAD_SI128(l_ptr);
248 const __m128i h_n = LOAD_SI128(h_ptr);
249 __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
250
251 if (n == 0)
252 tmp_n = _mm_add_epi16(tmp_n, h_n);
253 else
254 {
255 const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
256 tmp_n = _mm_add_epi16(tmp_n, h_n_m);
257 }
258
259 tmp_n = _mm_srai_epi16(tmp_n, 1);
260 const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
261 STORE_SI128(dst_ptr, dst_n);
262 l_ptr += 8;
263 h_ptr += 8;
264 dst_ptr += 8;
265 }
266
267 dst_ptr += total_width;
268 }
269
270 h_ptr = h;
271 dst_ptr = dst + total_width;
272
273 /* Odd coefficients */
274 for (size_t n = 0; n < subband_width; n++)
275 {
276 for (size_t x = 0; x < total_width; x += 8)
277 {
278 /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
279 __m128i h_n = LOAD_SI128(h_ptr);
280 __m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
281 h_n = _mm_slli_epi16(h_n, 1);
282 __m128i tmp_n = dst_n_m;
283
284 if (n == subband_width - 1)
285 tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
286 else
287 {
288 const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
289 tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
290 }
291
292 tmp_n = _mm_srai_epi16(tmp_n, 1);
293 const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
294 STORE_SI128(dst_ptr, dst_n);
295 h_ptr += 8;
296 dst_ptr += 8;
297 }
298
299 dst_ptr += total_width;
300 }
301}
302
303static __inline void __attribute__((ATTRIBUTES))
304rfx_dwt_2d_decode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
305 size_t subband_width)
306{
307 mm_prefetch_buffer((char*)idwt, 4ULL * subband_width * sizeof(INT16));
308 /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt.
309 */
310 /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
311 /* The lower part L uses LL(3) and HL(0). */
312 /* The higher part H uses LH(1) and HH(2). */
313 INT16* ll = buffer + 3ULL * subband_width * subband_width;
314 INT16* hl = buffer;
315 INT16* l_dst = idwt;
316 rfx_dwt_2d_decode_block_horiz_sse2(ll, hl, l_dst, subband_width);
317 INT16* lh = buffer + 1ULL * subband_width * subband_width;
318 INT16* hh = buffer + 2ULL * subband_width * subband_width;
319 INT16* h_dst = idwt + 2ULL * subband_width * subband_width;
320 rfx_dwt_2d_decode_block_horiz_sse2(lh, hh, h_dst, subband_width);
321 /* Inverse DWT in vertical direction, results are stored in original buffer. */
322 rfx_dwt_2d_decode_block_vert_sse2(l_dst, h_dst, buffer, subband_width);
323}
324
325static void rfx_dwt_2d_decode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
326{
327 WINPR_ASSERT(buffer);
328 WINPR_ASSERT(dwt_buffer);
329
330 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
331 rfx_dwt_2d_decode_block_sse2(&buffer[3840], dwt_buffer, 8);
332 rfx_dwt_2d_decode_block_sse2(&buffer[3072], dwt_buffer, 16);
333 rfx_dwt_2d_decode_block_sse2(&buffer[0], dwt_buffer, 32);
334}
335
336static __inline void __attribute__((ATTRIBUTES))
337rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
338 INT16* WINPR_RESTRICT h, size_t subband_width)
339{
340 const size_t total_width = subband_width << 1;
341
342 for (size_t n = 0; n < subband_width; n++)
343 {
344 for (size_t x = 0; x < total_width; x += 8)
345 {
346 __m128i src_2n = LOAD_SI128(src);
347 __m128i src_2n_1 = LOAD_SI128(src + total_width);
348 __m128i src_2n_2 = src_2n;
349
350 if (n < subband_width - 1)
351 src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
352
353 /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
354 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
355 h_n = _mm_srai_epi16(h_n, 1);
356 h_n = _mm_sub_epi16(src_2n_1, h_n);
357 h_n = _mm_srai_epi16(h_n, 1);
358 STORE_SI128(h, h_n);
359
360 __m128i h_n_m = h_n;
361 if (n != 0)
362 h_n_m = LOAD_SI128(h - total_width);
363
364 /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
365 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
366 l_n = _mm_srai_epi16(l_n, 1);
367 l_n = _mm_add_epi16(l_n, src_2n);
368 STORE_SI128(l, l_n);
369 src += 8;
370 l += 8;
371 h += 8;
372 }
373
374 src += total_width;
375 }
376}
377
378static __inline void __attribute__((ATTRIBUTES))
379rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRICT l,
380 INT16* WINPR_RESTRICT h, size_t subband_width)
381{
382 for (size_t y = 0; y < subband_width; y++)
383 {
384 for (size_t n = 0; n < subband_width; n += 8)
385 {
386 /* The following 3 Set operations consumes more than half of the total DWT processing
387 * time! */
388 const INT16 src16 = (INT16)(((n + 8) == subband_width) ? src[14] : src[16]);
389 __m128i src_2n =
390 _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
391 __m128i src_2n_1 =
392 _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
393 __m128i src_2n_2 =
394 _mm_set_epi16(src16, src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
395 /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
396 __m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
397 h_n = _mm_srai_epi16(h_n, 1);
398 h_n = _mm_sub_epi16(src_2n_1, h_n);
399 h_n = _mm_srai_epi16(h_n, 1);
400 STORE_SI128(h, h_n);
401 __m128i h_n_m = LOAD_SI128(h - 1);
402
403 if (n == 0)
404 {
405 int first = _mm_extract_epi16(h_n_m, 1);
406 h_n_m = _mm_insert_epi16(h_n_m, first, 0);
407 }
408
409 /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
410 __m128i l_n = _mm_add_epi16(h_n_m, h_n);
411 l_n = _mm_srai_epi16(l_n, 1);
412 l_n = _mm_add_epi16(l_n, src_2n);
413 STORE_SI128(l, l_n);
414 src += 16;
415 l += 8;
416 h += 8;
417 }
418 }
419}
420
421static __inline void __attribute__((ATTRIBUTES))
422rfx_dwt_2d_encode_block_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt,
423 size_t subband_width)
424{
425 mm_prefetch_buffer((char*)dwt, 4ULL * subband_width * sizeof(INT16));
426 /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
427 INT16* l_src = dwt;
428 INT16* h_src = dwt + 2ULL * subband_width * subband_width;
429 rfx_dwt_2d_encode_block_vert_sse2(buffer, l_src, h_src, subband_width);
430 /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order,
431 * stored in original buffer. */
432 /* The lower part L generates LL(3) and HL(0). */
433 /* The higher part H generates LH(1) and HH(2). */
434 INT16* ll = buffer + 3ULL * subband_width * subband_width;
435 INT16* hl = buffer;
436 INT16* lh = buffer + 1ULL * subband_width * subband_width;
437 INT16* hh = buffer + 2ULL * subband_width * subband_width;
438 rfx_dwt_2d_encode_block_horiz_sse2(l_src, ll, hl, subband_width);
439 rfx_dwt_2d_encode_block_horiz_sse2(h_src, lh, hh, subband_width);
440}
441
442static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT dwt_buffer)
443{
444 WINPR_ASSERT(buffer);
445 WINPR_ASSERT(dwt_buffer);
446
447 mm_prefetch_buffer((char*)buffer, 4096 * sizeof(INT16));
448 rfx_dwt_2d_encode_block_sse2(buffer, dwt_buffer, 32);
449 rfx_dwt_2d_encode_block_sse2(buffer + 3072, dwt_buffer, 16);
450 rfx_dwt_2d_encode_block_sse2(buffer + 3840, dwt_buffer, 8);
451}
452#endif
453
454void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context)
455{
456#if defined(SSE_AVX_INTRINSICS_ENABLED)
457 PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2")
458 PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2")
459 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2")
460 PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode_sse2")
461 context->quantization_decode = rfx_quantization_decode_sse2;
462 context->quantization_encode = rfx_quantization_encode_sse2;
463 context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
464 context->dwt_2d_encode = rfx_dwt_2d_encode_sse2;
465#else
466 WINPR_UNUSED(context);
467#endif
468}