FreeRDP
Loading...
Searching...
No Matches
prim_copy_avx2.c
1/* FreeRDP: A Remote Desktop Protocol Client
2 * Copy operations.
3 * vi:ts=4 sw=4:
4 *
5 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7 * not use this file except in compliance with the License. You may obtain
8 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 * or implied. See the License for the specific language governing
13 * permissions and limitations under the License.
14 */
15
16#include <winpr/sysinfo.h>
17
18#include <freerdp/config.h>
19
20#include <string.h>
21#include <freerdp/types.h>
22#include <freerdp/primitives.h>
23#include <freerdp/log.h>
24
25#include "prim_internal.h"
26#include "prim_copy.h"
27#include "../codec/color.h"
28
29#include <freerdp/codec/color.h>
30
31#if defined(SSE_AVX_INTRINSICS_ENABLED)
32#include <emmintrin.h>
33#include <immintrin.h>
34
35static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
36 uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
37{
38 return _mm256_set_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4,
39 (int32_t)i5, (int32_t)i6, (int32_t)i7);
40}
41
42static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
43 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
44 UINT32 nHeight,
45 const BYTE* WINPR_RESTRICT pSrcData,
46 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
47 int64_t srcVMultiplier, int64_t srcVOffset,
48 int64_t dstVMultiplier, int64_t dstVOffset)
49{
50
51 const int64_t srcByte = 3;
52 const int64_t dstByte = 4;
53
54 const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
55 0xFF000000, 0xFF000000, 0xFF000000);
56 const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
57 0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
58 const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
59 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
60 const UINT32 rem = nWidth % 8;
61 const int64_t width = nWidth - rem;
62
63 for (int64_t y = 0; y < nHeight; y++)
64 {
65 const BYTE* WINPR_RESTRICT srcLine =
66 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
67 BYTE* WINPR_RESTRICT dstLine =
68 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
69
70 int64_t x = 0;
71
72 /* Ensure alignment requirements can be met */
73 for (; x < width; x += 8)
74 {
75 const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
76 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
77 const __m256i s0 = _mm256_loadu_si256(src);
78 __m256i s1 = _mm256_shuffle_epi8(s0, smask);
79
80 /* _mm256_shuffle_epi8 can not cross 128bit lanes.
81 * manually copy these bytes with extract/insert */
82 const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
83 const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
84 const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
85 0x00000000, 0x00000000, 0x00000000, 0x00000000);
86 const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
87
88 const __m256i s2 = _mm256_loadu_si256(dst);
89 __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
90 _mm256_storeu_si256(dst, d0);
91 }
92
93 for (; x < nWidth; x++)
94 {
95 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
96 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
97 *dst++ = *src++;
98 *dst++ = *src++;
99 *dst++ = *src++;
100 }
101 }
102
103 return PRIMITIVES_SUCCESS;
104}
105
106static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
107 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
108 UINT32 nWidth, UINT32 nHeight,
109 const BYTE* WINPR_RESTRICT pSrcData,
110 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
111 int64_t srcVMultiplier, int64_t srcVOffset,
112 int64_t dstVMultiplier, int64_t dstVOffset)
113{
114
115 const int64_t srcByte = 4;
116 const int64_t dstByte = 4;
117
118 const __m256i mask = _mm256_setr_epi8(
119 (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
120 (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
121 (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
122 (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
123 const UINT32 rem = nWidth % 8;
124 const int64_t width = nWidth - rem;
125 for (int64_t y = 0; y < nHeight; y++)
126 {
127 const BYTE* WINPR_RESTRICT srcLine =
128 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
129 BYTE* WINPR_RESTRICT dstLine =
130 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
131
132 int64_t x = 0;
133 for (; x < width; x += 8)
134 {
135 const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
136 __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
137 const __m256i s0 = _mm256_loadu_si256(src);
138 const __m256i s1 = _mm256_loadu_si256(dst);
139 __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
140 _mm256_storeu_si256(dst, d0);
141 }
142
143 for (; x < nWidth; x++)
144 {
145 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
146 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
147 *dst++ = *src++;
148 *dst++ = *src++;
149 *dst++ = *src++;
150 }
151 }
152
153 return PRIMITIVES_SUCCESS;
154}
155
156static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
157 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
158 UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
159 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
160 UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
161 int64_t dstVOffset)
162{
163 WINPR_ASSERT(pDstData);
164 WINPR_ASSERT(pSrcData);
165
166 switch (SrcFormat)
167 {
168 case PIXEL_FORMAT_BGR24:
169 switch (DstFormat)
170 {
171 case PIXEL_FORMAT_BGRX32:
172 case PIXEL_FORMAT_BGRA32:
173 return avx2_image_copy_bgr24_bgrx32(
174 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
175 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
176 default:
177 break;
178 }
179 break;
180 case PIXEL_FORMAT_BGRX32:
181 case PIXEL_FORMAT_BGRA32:
182 switch (DstFormat)
183 {
184 case PIXEL_FORMAT_BGRX32:
185 case PIXEL_FORMAT_BGRA32:
186 return avx2_image_copy_bgrx32_bgrx32(
187 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
188 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
189 default:
190 break;
191 }
192 break;
193 case PIXEL_FORMAT_RGBX32:
194 case PIXEL_FORMAT_RGBA32:
195 switch (DstFormat)
196 {
197 case PIXEL_FORMAT_RGBX32:
198 case PIXEL_FORMAT_RGBA32:
199 return avx2_image_copy_bgrx32_bgrx32(
200 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
201 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
202 default:
203 break;
204 }
205 break;
206 default:
207 break;
208 }
209
210 primitives_t* gen = primitives_get_generic();
211 return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
212 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
213}
214
215static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
216 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
217 UINT32 nWidth, UINT32 nHeight,
218 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
219 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
220 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
221{
222 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
223 int64_t srcVOffset = 0;
224 int64_t srcVMultiplier = 1;
225 int64_t dstVOffset = 0;
226 int64_t dstVMultiplier = 1;
227
228 if ((nWidth == 0) || (nHeight == 0))
229 return PRIMITIVES_SUCCESS;
230
231 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
232 return -1;
233
234 if (!pDstData || !pSrcData)
235 return -1;
236
237 if (nDstStep == 0)
238 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
239
240 if (nSrcStep == 0)
241 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
242
243 if (vSrcVFlip)
244 {
245 srcVOffset = (nHeight - 1ll) * nSrcStep;
246 srcVMultiplier = -1;
247 }
248
249 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
250 return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
251 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
252 nXSrc, nYSrc, palette, flags, srcVMultiplier,
253 srcVOffset, dstVMultiplier, dstVOffset);
254 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
255 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
256 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
257 nXSrc, nYSrc, palette, srcVMultiplier,
258 srcVOffset, dstVMultiplier, dstVOffset, flags);
259 else
260 {
261 primitives_t* gen = primitives_get_generic();
262 return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
263 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
264 }
265}
266#endif
267
268/* ------------------------------------------------------------------------- */
269void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
270{
271#if defined(SSE_AVX_INTRINSICS_ENABLED)
272 WLog_VRB(PRIM_TAG, "AVX2 optimizations");
273 prims->copy_no_overlap = avx2_image_copy_no_overlap;
274#else
275 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
276 WINPR_UNUSED(prims);
277#endif
278}
fn_copy_no_overlap_t copy_no_overlap
Definition primitives.h:304