/*
   FreeRDP: A Remote Desktop Protocol Implementation
   RemoteFX Codec Library - NEON Optimizations

   Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

#include <winpr/platform.h>
#include <freerdp/config.h>
#include <freerdp/log.h>

#include "../rfx_types.h"
#include "rfx_neon.h"

#include "../../core/simd.h"

#if defined(NEON_INTRINSICS_ENABLED)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arm_neon.h>
#include <winpr/sysinfo.h>

/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */

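/*
 * Dequantize one sub-band in place: every coefficient is shifted left by
 * `factor` bits (the callers below pass quantization value - 1), i.e. it is
 * multiplied by 2^factor. buffer_size must be a multiple of 8, since the loop
 * consumes one int16x8_t (eight INT16 values) per iteration.
 */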
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
    int16x8_t quantFactors = vdupq_n_s16(factor);
    int16x8_t* buf = (int16x8_t*)buffer;
    int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);

    do
    {
        int16x8_t val = vld1q_s16((INT16*)buf);
        val = vshlq_s16(val, quantFactors);
        vst1q_s16((INT16*)buf, val);
        buf++;
    } while (buf < buf_end);
}

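/*
 * Sub-band layout of the 4096-coefficient (64x64) tile buffer, as used by the
 * calls below:
 *   [   0..1023] HL1   [1024..2047] LH1   [2048..3071] HH1   (32x32 each)
 *   [3072..3327] HL2   [3328..3583] LH2   [3584..3839] HH2   (16x16 each)
 *   [3840..3903] HL3   [3904..3967] LH3   [3968..4031] HH3   ( 8x8  each)
 *   [4032..4095] LL3                                         ( 8x8 )
 * quantVals holds one quantization value per sub-band, indexed as
 * 0=LL3, 1=LH3, 2=HL3, 3=HH3, 4=LH2, 5=HL2, 6=HH2, 7=LH1, 8=HL1, 9=HH1.
 */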
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
    WINPR_ASSERT(buffer);
    WINPR_ASSERT(quantVals);

    rfx_quantization_decode_block_NEON(&buffer[0], 1024, quantVals[8] - 1);    /* HL1 */
    rfx_quantization_decode_block_NEON(&buffer[1024], 1024, quantVals[7] - 1); /* LH1 */
    rfx_quantization_decode_block_NEON(&buffer[2048], 1024, quantVals[9] - 1); /* HH1 */
    rfx_quantization_decode_block_NEON(&buffer[3072], 256, quantVals[5] - 1);  /* HL2 */
    rfx_quantization_decode_block_NEON(&buffer[3328], 256, quantVals[4] - 1);  /* LH2 */
    rfx_quantization_decode_block_NEON(&buffer[3584], 256, quantVals[6] - 1);  /* HH2 */
    rfx_quantization_decode_block_NEON(&buffer[3840], 64, quantVals[2] - 1);   /* HL3 */
    rfx_quantization_decode_block_NEON(&buffer[3904], 64, quantVals[1] - 1);   /* LH3 */
    rfx_quantization_decode_block_NEON(&buffer[3968], 64, quantVals[3] - 1);   /* HH3 */
    rfx_quantization_decode_block_NEON(&buffer[4032], 64, quantVals[0] - 1);   /* LL3 */
}

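/*
 * Horizontal inverse lifting step. For every row of the sub-band pair (L, H)
 * the two filter equations (also noted inline below) are evaluated eight
 * coefficients at a time:
 *   even: dst[2n]     = l[n] - ((h[n - 1] + h[n] + 1) >> 1)
 *   odd:  dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 * The vgetq_lane/vsetq_lane fix-ups at n == 0 and n == subband_width - 8
 * duplicate the edge neighbour instead of reading outside the row.
 */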
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
    INT16* l_ptr = l;
    INT16* h_ptr = h;
    INT16* dst_ptr = dst;

    for (size_t y = 0; y < subband_width; y++)
    {
        /* Even coefficients */
        for (size_t n = 0; n < subband_width; n += 8)
        {
            // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
            int16x8_t l_n = vld1q_s16(l_ptr);
            int16x8_t h_n = vld1q_s16(h_ptr);
            int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

            if (n == 0)
            {
                int16_t first = vgetq_lane_s16(h_n_m, 1);
                h_n_m = vsetq_lane_s16(first, h_n_m, 0);
            }

            int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
            tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
            tmp_n = vshrq_n_s16(tmp_n, 1);
            int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
            vst1q_s16(l_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
        }

        l_ptr -= subband_width;
        h_ptr -= subband_width;

        /* Odd coefficients */
        for (size_t n = 0; n < subband_width; n += 8)
        {
            // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
            int16x8_t h_n = vld1q_s16(h_ptr);
            h_n = vshlq_n_s16(h_n, 1);
            int16x8x2_t dst_n;
            dst_n.val[0] = vld1q_s16(l_ptr);
            int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

            if (n == subband_width - 8)
            {
                int16_t last = vgetq_lane_s16(dst_n_p, 6);
                dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
            }

            dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
            dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
            dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
            vst2q_s16(dst_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 16;
        }
    }
}

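/*
 * Vertical inverse lifting step. The same two filter equations are applied
 * down the columns, eight columns per load (total_width = 2 * subband_width
 * values per output row). The first and last rows have no neighbour above or
 * below, so the edge row is used twice instead of reading out of bounds.
 */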
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
    INT16* l_ptr = l;
    INT16* h_ptr = h;
    INT16* dst_ptr = dst;
    const size_t total_width = subband_width + subband_width;

    /* Even coefficients */
    for (size_t n = 0; n < subband_width; n++)
    {
        for (size_t x = 0; x < total_width; x += 8)
        {
            // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
            int16x8_t l_n = vld1q_s16(l_ptr);
            int16x8_t h_n = vld1q_s16(h_ptr);
            int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

            if (n == 0)
                tmp_n = vaddq_s16(tmp_n, h_n);
            else
            {
                int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
                tmp_n = vaddq_s16(tmp_n, h_n_m);
            }

            tmp_n = vshrq_n_s16(tmp_n, 1);
            int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
            vst1q_s16(dst_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 8;
        }

        dst_ptr += total_width;
    }

    h_ptr = h;
    dst_ptr = dst + total_width;

    /* Odd coefficients */
    for (size_t n = 0; n < subband_width; n++)
    {
        for (size_t x = 0; x < total_width; x += 8)
        {
            // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
            int16x8_t h_n = vld1q_s16(h_ptr);
            int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
            h_n = vshlq_n_s16(h_n, 1);
            int16x8_t tmp_n = dst_n_m;

            if (n == subband_width - 1)
                tmp_n = vaddq_s16(tmp_n, dst_n_m);
            else
            {
                int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
                tmp_n = vaddq_s16(tmp_n, dst_n_p);
            }

            tmp_n = vshrq_n_s16(tmp_n, 1);
            int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
            vst1q_s16(dst_ptr, dst_n);
            h_ptr += 8;
            dst_ptr += 8;
        }

        dst_ptr += total_width;
    }
}

static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
                             size_t subband_width)
{
    INT16 *hl, *lh, *hh, *ll;
    INT16 *l_dst, *h_dst;
    /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer
     * idwt. */
    /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
    /* The lower part L uses LL(3) and HL(0). */
    /* The higher part H uses LH(1) and HH(2). */
    ll = buffer + subband_width * subband_width * 3;
    hl = buffer;
    l_dst = idwt;
    rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
    lh = buffer + subband_width * subband_width;
    hh = buffer + subband_width * subband_width * 2;
    h_dst = idwt + subband_width * subband_width * 2;
    rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
    /* Inverse DWT in vertical direction, results are stored in original buffer. */
    rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
}

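/*
 * Three-level inverse DWT for one 64x64 component. Decoding starts at the
 * coarsest level: the level-3 sub-bands (4 x 8x8 = 256 coefficients) at
 * buffer[3840..4095] are reconstructed first; the resulting 16x16 block at
 * buffer[3840] then serves as the LL input of the level-2 block at
 * buffer[3072] (3072 + 3 * 16 * 16 = 3840), and its 32x32 result in turn
 * feeds the level-1 block at buffer[0] (0 + 3 * 32 * 32 = 3072).
 */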
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
    rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
    rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
    rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
}

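/*
 * The functions below implement the "extrapolate" variant of the inverse DWT
 * used by rfx_dwt_2d_extrapolate_decode_neon(). The filter equations are the
 * same as above, but the low and high bands of a level differ in size
 * (33/31, 17/16 and 9/8, see prfx_get_band_l_count()/prfx_get_band_h_count()
 * further down), so each vector loop is followed by scalar fix-ups for the
 * coefficients that do not fill a whole int16x8_t.
 */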
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
    WINPR_ASSERT(pLowBand);
    WINPR_ASSERT(pHighBand);
    WINPR_ASSERT(pDstBand);

    INT16* l_ptr = pLowBand;
    const INT16* h_ptr = pHighBand;
    INT16* dst_ptr = pDstBand;
    size_t batchSize = (nLowCount + nHighCount) >> 1;

    for (size_t y = 0; y < nDstCount; y++)
    {
        /* Even coefficients */
        size_t n = 0;
        for (; n < batchSize; n += 8)
        {
            // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
            int16x8_t l_n = vld1q_s16(l_ptr);
            int16x8_t h_n = vld1q_s16(h_ptr);
            int16x8_t h_n_m = vld1q_s16(h_ptr - 1);

            if (n == 0)
            {
                int16_t first = vgetq_lane_s16(h_n_m, 1);
                h_n_m = vsetq_lane_s16(first, h_n_m, 0);
            }
            else if (n == 24)
                h_n = vsetq_lane_s16(0, h_n, 7);

            int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
            tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
            tmp_n = vshrq_n_s16(tmp_n, 1);
            int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
            vst1q_s16(l_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
        }
        if (n < 32)
            *l_ptr -= *(h_ptr - 1);

        l_ptr -= batchSize;
        h_ptr -= batchSize;

        /* Odd coefficients */
        n = 0;
        for (; n < batchSize; n += 8)
        {
            // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
            int16x8_t h_n = vld1q_s16(h_ptr);
            h_n = vshlq_n_s16(h_n, 1);
            int16x8x2_t dst_n;
            dst_n.val[0] = vld1q_s16(l_ptr);
            int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

            if (n == 24)
                h_n = vsetq_lane_s16(0, h_n, 7);

            dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
            dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
            dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
            vst2q_s16(dst_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 16;
        }
        if (n == 32)
        {
            h_ptr -= 1;
            l_ptr += 1;
        }
        else
        {
            *dst_ptr = *l_ptr;
            l_ptr += 1;
            dst_ptr += 1;
        }
    }
}

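/*
 * Vertical pass of the extrapolated inverse DWT. batchSize is the destination
 * width rounded down to a multiple of 8; the (nDstCount > batchSize) branches
 * finish the one remaining column with scalar code. The n == 31 / n < 31
 * special cases only trigger at level 1, where the high band has just 31 rows
 * and the previous row (h_ptr - nHighStep) stands in for the missing 32nd one.
 */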
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
    WINPR_ASSERT(pLowBand);
    WINPR_ASSERT(pHighBand);
    WINPR_ASSERT(pDstBand);

    const INT16* l_ptr = pLowBand;
    const INT16* h_ptr = pHighBand;
    INT16* dst_ptr = pDstBand;
    size_t batchSize = (nDstCount >> 3) << 3;
    size_t forceBandSize = (nLowCount + nHighCount) >> 1;

    /* Even coefficients */
    for (size_t n = 0; n < forceBandSize; n++)
    {
        for (size_t x = 0; x < batchSize; x += 8)
        {
            // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
            int16x8_t l_n = vld1q_s16(l_ptr);
            int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
            int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

            if (n == 0)
                tmp_n = vaddq_s16(tmp_n, h_n);
            else if (n < 31)
            {
                int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
                tmp_n = vaddq_s16(tmp_n, h_n_m);
            }

            tmp_n = vshrq_n_s16(tmp_n, 1);
            int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
            vst1q_s16(dst_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 8;
        }

        if (nDstCount > batchSize)
        {
            int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
            int16_t tmp_n = h_n + 1;
            if (n == 0)
                tmp_n += h_n;
            else if (n < 31)
                tmp_n += *(h_ptr - nHighStep);
            tmp_n >>= 1;
            *dst_ptr = *l_ptr - tmp_n;
            l_ptr += 1;
            h_ptr += 1;
            dst_ptr += 1;
        }

        dst_ptr += nDstStep;
    }

    if (forceBandSize < 32)
    {
        for (size_t x = 0; x < batchSize; x += 8)
        {
            int16x8_t l_n = vld1q_s16(l_ptr);
            int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
            int16x8_t tmp_n = vsubq_s16(l_n, h_n);
            vst1q_s16(dst_ptr, tmp_n);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 8;
        }

        if (nDstCount > batchSize)
        {
            *dst_ptr = *l_ptr - *(h_ptr - nHighStep);
            l_ptr += 1;
            h_ptr += 1;
            dst_ptr += 1;
        }
    }

    h_ptr = pHighBand;
    dst_ptr = pDstBand + nDstStep;

    /* Odd coefficients */
    for (size_t n = 0; n < forceBandSize; n++)
    {
        for (size_t x = 0; x < batchSize; x += 8)
        {
            // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
            int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
            if (n == 31)
            {
                int16x8_t dst_n_p = vld1q_s16(l_ptr);
                l_ptr += 8;
                tmp_n = vaddq_s16(tmp_n, dst_n_p);
                tmp_n = vshrq_n_s16(tmp_n, 1);
            }
            else
            {
                int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
                tmp_n = vaddq_s16(tmp_n, dst_n_p);
                tmp_n = vshrq_n_s16(tmp_n, 1);
                int16x8_t h_n = vld1q_s16(h_ptr);
                h_n = vshlq_n_s16(h_n, 1);
                tmp_n = vaddq_s16(tmp_n, h_n);
            }
            vst1q_s16(dst_ptr, tmp_n);
            h_ptr += 8;
            dst_ptr += 8;
        }

        if (nDstCount > batchSize)
        {
            int16_t tmp_n = *(dst_ptr - nDstStep);
            if (n == 31)
            {
                int16_t dst_n_p = *l_ptr;
                l_ptr += 1;
                tmp_n += dst_n_p;
                tmp_n >>= 1;
            }
            else
            {
                int16_t dst_n_p = *(dst_ptr + nDstStep);
                tmp_n += dst_n_p;
                tmp_n >>= 1;
                int16_t h_n = *h_ptr;
                h_n <<= 1;
                tmp_n += h_n;
            }
            *dst_ptr = tmp_n;
            h_ptr += 1;
            dst_ptr += 1;
        }

        dst_ptr += nDstStep;
    }
}

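/*
 * Band sizes for the extrapolated DWT of a 64-sample dimension:
 *   level 1: L = (64 >> 1) + 1 = 33, H = (64 >> 1) - 1 = 31
 *   level 2: L = (64 >> 2) + 1 = 17, H = (64 + 2) >> 2  = 16
 *   level 3: L = (64 >> 3) + 1 =  9, H = (64 + 4) >> 3  =  8
 * At every level L + H equals the number of samples being reconstructed
 * (64, 33 and 17 respectively).
 */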
static INLINE size_t prfx_get_band_l_count(size_t level)
{
    return (64 >> level) + 1;
}

static INLINE size_t prfx_get_band_h_count(size_t level)
{
    if (level == 1)
        return (64 >> 1) - 1;
    else
        return (64 + (1 << (level - 1))) >> level;
}

static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
    size_t nDstStepX;
    size_t nDstStepY;
    INT16 *HL, *LH;
    INT16 *HH, *LL;
    INT16 *L, *H, *LLx;

    const size_t nBandL = prfx_get_band_l_count(level);
    const size_t nBandH = prfx_get_band_h_count(level);
    size_t offset = 0;

    WINPR_ASSERT(buffer);
    WINPR_ASSERT(temp);

    HL = &buffer[offset];
    offset += (nBandH * nBandL);
    LH = &buffer[offset];
    offset += (nBandL * nBandH);
    HH = &buffer[offset];
    offset += (nBandH * nBandH);
    LL = &buffer[offset];
    nDstStepX = (nBandL + nBandH);
    nDstStepY = (nBandL + nBandH);
    offset = 0;
    L = &temp[offset];
    offset += (nBandL * nDstStepX);
    H = &temp[offset];
    LLx = &buffer[0];

    /* horizontal (LL + HL -> L) */
    rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);

    /* horizontal (LH + HH -> H) */
    rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);

    /* vertical (L + H -> LL) */
    rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
                                   nBandL + nBandH);
}

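/*
 * The block offsets below follow from the sub-band sizes per level:
 *   level 1: HL1 + LH1 + HH1 = 31*33 + 33*31 + 31*31 = 3007 coefficients,
 *            so the level-2 data (LL1) starts at buffer[3007];
 *   level 2: HL2 + LH2 + HH2 = 16*17 + 17*16 + 16*16 = 800,
 *            so the level-3 data (LL2) starts at buffer[3007 + 800] = buffer[3807].
 * Decoding runs coarsest-to-finest; each pass writes its LL result back over
 * the region that the next (finer) level reads as its LL band.
 */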
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
    WINPR_ASSERT(buffer);
    WINPR_ASSERT(temp);
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
    rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
}
#endif // NEON_INTRINSICS_ENABLED

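/*
 * Bind the NEON kernels into the codec context. When the build does not enable
 * NEON intrinsics this is a no-op. Only the profiler label is renamed for the
 * YCbCr-to-RGB step: that conversion lives in the primitives library (see the
 * note near the top of this file), so no function pointer is assigned for it
 * here.
 */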
void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context)
{
#if defined(NEON_INTRINSICS_ENABLED)
    DEBUG_RFX("Using NEON optimizations");
    PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
    PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON");
    PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
    context->quantization_decode = rfx_quantization_decode_NEON;
    context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
    context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
#else
    WINPR_UNUSED(context);
#endif
}