FreeRDP
Loading...
Searching...
No Matches
prim_colors_neon.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized Color conversion operations.
 * vi:ts=4 sw=4:
 *
 * Copyright 2011 Stephen Erisman
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
19
20#include <freerdp/config.h>
21
22#include <freerdp/types.h>
23#include <freerdp/primitives.h>
24#include <winpr/sysinfo.h>
25
26#include "prim_internal.h"
27#include "prim_colors.h"
28
29/*---------------------------------------------------------------------------*/
30#if defined(NEON_INTRINSICS_ENABLED)
31#include <arm_neon.h>
32
33static primitives_t* generic = NULL; /* Fallback primitives table: assigned from primitives_get_generic() in primitives_init_colors_neon_int() and called by the dispatchers below for destination formats that have no NEON path. */
34
35static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
36 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
37 const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
38 uint8_t gPos, uint8_t bPos, uint8_t aPos)
39{
40 BYTE* pRGB = pDst;
41 const INT16* pY = pSrc[0];
42 const INT16* pCb = pSrc[1];
43 const INT16* pCr = pSrc[2];
44 const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
45 const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
46 const size_t pad = roi->width % 8;
47 const int16x4_t c4096 = vdup_n_s16(4096);
48
49 for (UINT32 y = 0; y < roi->height; y++)
50 {
51 for (UINT32 x = 0; x < roi->width - pad; x += 8)
52 {
53 const int16x8_t Y = vld1q_s16(pY);
54 const int16x4_t Yh = vget_high_s16(Y);
55 const int16x4_t Yl = vget_low_s16(Y);
56 const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
57 const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
58 const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
59 const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
60 const int16x8_t Cr = vld1q_s16(pCr);
61 const int16x4_t Crh = vget_high_s16(Cr);
62 const int16x4_t Crl = vget_low_s16(Cr);
63 const int16x8_t Cb = vld1q_s16(pCb);
64 const int16x4_t Cbh = vget_high_s16(Cb);
65 const int16x4_t Cbl = vget_low_s16(Cb);
66 uint8x8x4_t bgrx;
67 {
68 /* R */
69 const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
70 const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
71 const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
72 const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
73 const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
74 const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
75 const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
76 bgrx.val[rPos] = vqmovun_s16(Rs);
77 }
78 {
79 /* G */
80 const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
81 const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
82 const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
83 const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
84 const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
85 const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
86 const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
87 const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
88 const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
89 const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
90 const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
91 const uint8x8_t G = vqmovun_s16(Gs);
92 bgrx.val[gPos] = G;
93 }
94 {
95 /* B */
96 const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
97 const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
98 const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
99 const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
100 const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
101 const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
102 const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
103 const uint8x8_t B = vqmovun_s16(Bs);
104 bgrx.val[bPos] = B;
105 }
106 /* A */
107 {
108 bgrx.val[aPos] = vdup_n_u8(0xFF);
109 }
110 vst4_u8(pRGB, bgrx);
111 pY += 8;
112 pCb += 8;
113 pCr += 8;
114 pRGB += 32;
115 }
116
117 for (UINT32 x = 0; x < pad; x++)
118 {
119 const INT32 divisor = 16;
120 const INT32 Y = ((*pY++) + 4096) << divisor;
121 const INT32 Cb = (*pCb++);
122 const INT32 Cr = (*pCr++);
123 const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
124 const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
125 const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
126 const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
127 INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
128 INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
129 INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
130 BYTE bgrx[4];
131 bgrx[bPos] = CLIP(B);
132 bgrx[gPos] = CLIP(G);
133 bgrx[rPos] = CLIP(R);
134 bgrx[aPos] = 0xFF;
135 *pRGB++ = bgrx[0];
136 *pRGB++ = bgrx[1];
137 *pRGB++ = bgrx[2];
138 *pRGB++ = bgrx[3];
139 }
140
141 pY += srcPad;
142 pCb += srcPad;
143 pCr += srcPad;
144 pRGB += dstPad;
145 }
146
147 return PRIMITIVES_SUCCESS;
148}
149
150static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep,
151 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
152 UINT32 DstFormat,
153 const prim_size_t* WINPR_RESTRICT roi)
154{
155 switch (DstFormat)
156 {
157 case PIXEL_FORMAT_BGRA32:
158 case PIXEL_FORMAT_BGRX32:
159 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
160
161 case PIXEL_FORMAT_RGBA32:
162 case PIXEL_FORMAT_RGBX32:
163 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
164
165 case PIXEL_FORMAT_ARGB32:
166 case PIXEL_FORMAT_XRGB32:
167 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
168
169 case PIXEL_FORMAT_ABGR32:
170 case PIXEL_FORMAT_XBGR32:
171 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
172
173 default:
174 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
175 }
176}
177
178static pstatus_t
179neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
180 UINT32 srcStep, /* bytes between rows in source data */
181 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
182 UINT32 dstStep, /* bytes between rows in dest data */
183 const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
184 uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
185{
186 UINT32 pad = roi->width % 8;
187
188 for (UINT32 y = 0; y < roi->height; y++)
189 {
190 const INT16* pr = (const INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
191 const INT16* pg = (const INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
192 const INT16* pb = (const INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
193 BYTE* dst = pDst + y * dstStep;
194
195 for (UINT32 x = 0; x < roi->width - pad; x += 8)
196 {
197 int16x8_t r = vld1q_s16(pr);
198 int16x8_t g = vld1q_s16(pg);
199 int16x8_t b = vld1q_s16(pb);
200 uint8x8x4_t bgrx;
201 bgrx.val[aPos] = vdup_n_u8(0xFF);
202 bgrx.val[rPos] = vqmovun_s16(r);
203 bgrx.val[gPos] = vqmovun_s16(g);
204 bgrx.val[bPos] = vqmovun_s16(b);
205 vst4_u8(dst, bgrx);
206 pr += 8;
207 pg += 8;
208 pb += 8;
209 dst += 32;
210 }
211
212 for (UINT32 x = 0; x < pad; x++)
213 {
214 BYTE bgrx[4];
215 bgrx[bPos] = *pb++;
216 bgrx[gPos] = *pg++;
217 bgrx[rPos] = *pr++;
218 bgrx[aPos] = 0xFF;
219 *dst++ = bgrx[0];
220 *dst++ = bgrx[1];
221 *dst++ = bgrx[2];
222 *dst++ = bgrx[3];
223 }
224 }
225
226 return PRIMITIVES_SUCCESS;
227}
228
229static pstatus_t
230neon_RGBToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
231 UINT32 srcStep, /* bytes between rows in source data */
232 BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
233 UINT32 dstStep, /* bytes between rows in dest data */
234 UINT32 DstFormat,
235 const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
236{
237 switch (DstFormat)
238 {
239 case PIXEL_FORMAT_BGRA32:
240 case PIXEL_FORMAT_BGRX32:
241 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
242
243 case PIXEL_FORMAT_RGBA32:
244 case PIXEL_FORMAT_RGBX32:
245 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
246
247 case PIXEL_FORMAT_ARGB32:
248 case PIXEL_FORMAT_XRGB32:
249 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
250
251 case PIXEL_FORMAT_ABGR32:
252 case PIXEL_FORMAT_XBGR32:
253 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
254
255 default:
256 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
257 }
258}
259#endif /* NEON_INTRINSICS_ENABLED */
260
261/* ------------------------------------------------------------------------- */
262void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims)
263{
264#if defined(NEON_INTRINSICS_ENABLED)
265 generic = primitives_get_generic();
266
267 WLog_VRB(PRIM_TAG, "NEON optimizations");
268 prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
269 prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
270#else
271 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or neon intrinsics not available");
272 WINPR_UNUSED(prims);
273#endif
274}