20#include <freerdp/config.h>
22#include <freerdp/types.h>
23#include <freerdp/primitives.h>
24#include <winpr/sysinfo.h>
26#include "prim_internal.h"
27#include "prim_colors.h"
30#if defined(NEON_INTRINSICS_ENABLED)
/**
 * NEON conversion of three INT16 planes (pSrc[0] = Y, pSrc[1] = Cb,
 * pSrc[2] = Cr) into 4-channel 8-bit pixels.
 *
 * rPos, gPos, bPos and aPos select which of the four output channels
 * receives red, green, blue and alpha. Each row is handled in two steps:
 * a vector loop that consumes 8 pixels per iteration, followed by a scalar
 * loop for the remaining (roi->width % 8) pixels.
 *
 * Fixed-point scheme visible in both loops: Y is biased by 4096 and scaled
 * by 2^16, the chroma terms are multiplied by 16.16 coefficients, and the
 * sum is shifted right by 21 bits in total (16 + 5) before being clamped
 * to 8 bits.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): this excerpt omits several lines of the function (braces,
 * the declarations of bgrx, the stores to pDst and the per-row pointer
 * adjustments), so how srcPad, dstPad and pDst are used is not visible
 * here - confirm against the full file.
 */
35static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
36 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
37 const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
38 uint8_t gPos, uint8_t bPos, uint8_t aPos)
41 const INT16* pY = pSrc[0];
42 const INT16* pCb = pSrc[1];
43 const INT16* pCr = pSrc[2];
/* Source row padding, expressed in INT16 elements rather than bytes. */
44 const size_t srcPad = (srcStep - (roi->width *
sizeof(INT16))) /
sizeof(INT16);
/* Destination row padding, expressed in 4-byte pixels. */
45 const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
/* Number of trailing pixels per row that do not fill a group of 8. */
46 const size_t pad = roi->width % 8;
/* Bias added to every Y sample before scaling. */
47 const int16x4_t c4096 = vdup_n_s16(4096);
49 for (UINT32 y = 0; y < roi->height; y++)
/* Vector part: 8 pixels per iteration. */
51 for (UINT32 x = 0; x < roi->width - pad; x += 8)
/* Load 8 Y samples, widen to 32 bit while adding the bias, scale by 2^16. */
53 const int16x8_t Y = vld1q_s16(pY);
54 const int16x4_t Yh = vget_high_s16(Y);
55 const int16x4_t Yl = vget_low_s16(Y);
56 const int32x4_t YhAdd = vaddl_s16(Yh, c4096);
57 const int32x4_t YlAdd = vaddl_s16(Yl, c4096);
58 const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
59 const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
/* Load 8 Cr and 8 Cb samples and split each into high/low halves. */
60 const int16x8_t Cr = vld1q_s16(pCr);
61 const int16x4_t Crh = vget_high_s16(Cr);
62 const int16x4_t Crl = vget_low_s16(Cr);
63 const int16x8_t Cb = vld1q_s16(pCb);
64 const int16x4_t Cbh = vget_high_s16(Cb);
65 const int16x4_t Cbl = vget_low_s16(Cb);
/*
 * Red: (Y + Cr * 91916) >> 21, where 91916 ~= 1.402525 * 2^16.
 * The coefficient does not fit in 16 bits, so Cr is widened first and
 * multiplied in 32 bit.
 */
69 const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916);
70 const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916);
71 const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
72 const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
/* vmovn narrows 32 -> 16 bit; vqmovun then saturates to unsigned 8 bit. */
73 const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
74 const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
75 const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
76 bgrx.val[rPos] = vqmovun_s16(Rs);
/*
 * Green: (Y - (Cb * 22527 + Cr * 46819)) >> 21, with
 * 22527 ~= 0.343730 * 2^16 and 46819 ~= 0.714401 * 2^16.
 * 22527 fits in 16 bits, so the widening multiply vmull_n_s16 is used for Cb.
 */
80 const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);
81 const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);
82 const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819);
83 const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819);
84 const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
85 const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
86 const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
87 const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
88 const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
89 const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
90 const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
/* NOTE(review): the assignment of G into bgrx is not visible in this excerpt. */
91 const uint8x8_t G = vqmovun_s16(Gs);
/* Blue: (Y + Cb * 115992) >> 21, where 115992 ~= 1.769905 * 2^16. */
96 const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992);
97 const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992);
98 const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
99 const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
100 const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
101 const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
102 const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
/* NOTE(review): the assignment of B into bgrx is not visible in this excerpt. */
103 const uint8x8_t B = vqmovun_s16(Bs);
/* Alpha channel is set to fully opaque for all 8 pixels. */
108 bgrx.val[aPos] = vdup_n_u8(0xFF);
/* Scalar part: the remaining pad pixels of the row, one at a time. */
117 for (UINT32 x = 0; x < pad; x++)
119 const INT32 divisor = 16;
120 const INT32 Y = ((*pY++) + 4096) << divisor;
121 const INT32 Cb = (*pCb++);
122 const INT32 Cr = (*pCr++);
/*
 * NOTE(review): these coefficients are float products truncated to INT32,
 * which appears to give values one lower than the literals used in the
 * vector loop for three of the four (e.g. 91915 here vs 91916 there).
 * Confirm whether the two paths are meant to match exactly.
 */
123 const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
124 const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
125 const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
126 const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
/*
 * The value is shifted by 16, cast to INT16, then shifted by a further 5.
 * NOTE(review): the vector loop shifts by 21 in 32 bit before narrowing, so
 * results can differ when the intermediate does not fit in INT16 - verify.
 */
127 INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
128 INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
129 INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
/* Place each clipped component at its configured channel position. */
131 bgrx[bPos] = CLIP(B);
132 bgrx[gPos] = CLIP(G);
133 bgrx[rPos] = CLIP(R);
147 return PRIMITIVES_SUCCESS;
/**
 * Format dispatcher for the NEON YCbCr -> RGB conversion.
 *
 * Each supported pair of 32-bit destination formats is mapped to a call of
 * neon_yCbCrToRGB_16s8u_P3AC4R_X; the four trailing arguments are the
 * channel positions (rPos, gPos, bPos, aPos) for that format.
 *
 * NOTE(review): part of the parameter list and the switch header are not
 * visible in this excerpt; the switch is presumably on DstFormat.
 */
150static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
151 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
/* Channel order B, G, R, A/X: red at 2, green at 1, blue at 0, alpha at 3. */
157 case PIXEL_FORMAT_BGRA32:
158 case PIXEL_FORMAT_BGRX32:
159 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* Channel order R, G, B, A/X. */
161 case PIXEL_FORMAT_RGBA32:
162 case PIXEL_FORMAT_RGBX32:
163 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* Channel order A/X, R, G, B. */
165 case PIXEL_FORMAT_ARGB32:
166 case PIXEL_FORMAT_XRGB32:
167 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* Channel order A/X, B, G, R. */
169 case PIXEL_FORMAT_ABGR32:
170 case PIXEL_FORMAT_XBGR32:
171 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Reached when none of the cases above returned: use the generic implementation. */
174 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/**
 * NEON conversion of three INT16 planes (pSrc[0] = R, pSrc[1] = G,
 * pSrc[2] = B) into 4-channel 8-bit pixels.
 *
 * rPos, gPos, bPos and aPos select the output channel of each component.
 * Per row, groups of 8 pixels are processed by the vector loop and the
 * remaining (roi->width % 8) pixels by a second loop.
 *
 * Returns PRIMITIVES_SUCCESS.
 *
 * NOTE(review): the return type, part of the parameter list, the
 * declaration of bgrx, the stores to dst and the body of the tail loop are
 * not visible in this excerpt - confirm against the full file.
 */
179neon_RGBToRGB_16s8u_P3AC4R_X(
const INT16* WINPR_RESTRICT pSrc[3],
181 BYTE* WINPR_RESTRICT pDst,
184 uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
/* Number of trailing pixels per row that do not fill a group of 8. */
186 UINT32 pad = roi->width % 8;
188 for (UINT32 y = 0; y < roi->height; y++)
/* Row start pointers: srcStep and dstStep are applied as byte offsets. */
190 const INT16* pr = (
const INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
191 const INT16* pg = (
const INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
192 const INT16* pb = (
const INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
193 BYTE* dst = pDst + y * dstStep;
/* Vector part: 8 pixels per iteration. */
195 for (UINT32 x = 0; x < roi->width - pad; x += 8)
197 int16x8_t r = vld1q_s16(pr);
198 int16x8_t g = vld1q_s16(pg);
199 int16x8_t b = vld1q_s16(pb);
/* Alpha is fully opaque; colour components are saturated to unsigned 8 bit. */
201 bgrx.val[aPos] = vdup_n_u8(0xFF);
202 bgrx.val[rPos] = vqmovun_s16(r);
203 bgrx.val[gPos] = vqmovun_s16(g);
204 bgrx.val[bPos] = vqmovun_s16(b);
/* Tail part: the remaining pad pixels of the row. */
212 for (UINT32 x = 0; x < pad; x++)
226 return PRIMITIVES_SUCCESS;
/**
 * Format dispatcher for the NEON planar RGB -> packed RGB conversion.
 *
 * Each supported pair of 32-bit destination formats is mapped to a call of
 * neon_RGBToRGB_16s8u_P3AC4R_X; the four trailing arguments are the channel
 * positions (rPos, gPos, bPos, aPos) for that format.
 *
 * NOTE(review): the return type, part of the parameter list and the switch
 * header are not visible in this excerpt; the switch is presumably on
 * DstFormat.
 */
230neon_RGBToRGB_16s8u_P3AC4R(
const INT16* WINPR_RESTRICT pSrc[3],
232 BYTE* WINPR_RESTRICT pDst,
/* Channel order B, G, R, A/X: red at 2, green at 1, blue at 0, alpha at 3. */
239 case PIXEL_FORMAT_BGRA32:
240 case PIXEL_FORMAT_BGRX32:
241 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
/* Channel order R, G, B, A/X. */
243 case PIXEL_FORMAT_RGBA32:
244 case PIXEL_FORMAT_RGBX32:
245 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
/* Channel order A/X, R, G, B. */
247 case PIXEL_FORMAT_ARGB32:
248 case PIXEL_FORMAT_XRGB32:
249 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
/* Channel order A/X, B, G, R. */
251 case PIXEL_FORMAT_ABGR32:
252 case PIXEL_FORMAT_XBGR32:
253 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
/* Reached when none of the cases above returned: use the generic implementation. */
256 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
/**
 * Installs the NEON colour-conversion routines into the given primitives
 * table.
 *
 * When NEON_INTRINSICS_ENABLED is defined, the generic primitives are
 * fetched (they serve as the fallback for unhandled destination formats)
 * and the RGBToRGB_16s8u_P3AC4R and yCbCrToRGB_16s8u_P3AC4R entries of
 * prims are replaced with the NEON versions.
 *
 * NOTE(review): the preprocessor lines separating the two branches are not
 * visible in this excerpt; the second log message presumably belongs to the
 * branch taken when NEON_INTRINSICS_ENABLED is not defined.
 */
262void primitives_init_colors_neon_int(
primitives_t* WINPR_RESTRICT prims)
264#if defined(NEON_INTRINSICS_ENABLED)
265 generic = primitives_get_generic();
267 WLog_VRB(PRIM_TAG,
"NEON optimizations");
268 prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
269 prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
271 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or neon intrinsics not available");