FreeRDP
Loading...
Searching...
No Matches
prim_copy_sse4_1.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Copy operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
15
16#include <winpr/sysinfo.h>
17
18#include <freerdp/config.h>
19
20#include <string.h>
21#include <freerdp/types.h>
22#include <freerdp/primitives.h>
23#include <freerdp/log.h>
24
25#include "prim_internal.h"
26#include "prim_avxsse.h"
27#include "prim_copy.h"
28#include "../codec/color.h"
29
30#include <freerdp/codec/color.h>
31
32#if defined(SSE_AVX_INTRINSICS_ENABLED)
33#include <emmintrin.h>
34#include <immintrin.h>
35
36static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
37 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
38 UINT32 nHeight,
39 const BYTE* WINPR_RESTRICT pSrcData,
40 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
41 int64_t srcVMultiplier, int64_t srcVOffset,
42 int64_t dstVMultiplier, int64_t dstVOffset)
43{
44
45 const int64_t srcByte = 3;
46 const int64_t dstByte = 4;
47
48 const __m128i mask = mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
49 const __m128i smask = mm_set_epu32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
50 const UINT32 rem = nWidth % 4;
51
52 const int64_t width = nWidth - rem;
53 for (int64_t y = 0; y < nHeight; y++)
54 {
55 const BYTE* WINPR_RESTRICT srcLine =
56 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
57 BYTE* WINPR_RESTRICT dstLine =
58 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
59
60 int64_t x = 0;
61 /* Ensure alignment requirements can be met */
62 for (; x < width; x += 4)
63 {
64 const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
65 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
66 const __m128i s0 = LOAD_SI128(src);
67 const __m128i s1 = _mm_shuffle_epi8(s0, smask);
68 const __m128i s2 = LOAD_SI128(dst);
69
70 __m128i d0 = _mm_blendv_epi8(s1, s2, mask);
71 STORE_SI128(dst, d0);
72 }
73
74 for (; x < nWidth; x++)
75 {
76 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
77 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
78 *dst++ = *src++;
79 *dst++ = *src++;
80 *dst++ = *src++;
81 }
82 }
83
84 return PRIMITIVES_SUCCESS;
85}
86
87static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
88 UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
89 UINT32 nHeight,
90 const BYTE* WINPR_RESTRICT pSrcData,
91 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
92 int64_t srcVMultiplier, int64_t srcVOffset,
93 int64_t dstVMultiplier, int64_t dstVOffset)
94{
95
96 const int64_t srcByte = 4;
97 const int64_t dstByte = 4;
98
99 const __m128i mask = _mm_setr_epi8((char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF,
100 (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF,
101 (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);
102 const UINT32 rem = nWidth % 4;
103 const int64_t width = nWidth - rem;
104 for (int64_t y = 0; y < nHeight; y++)
105 {
106 const BYTE* WINPR_RESTRICT srcLine =
107 &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
108 BYTE* WINPR_RESTRICT dstLine =
109 &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
110
111 int64_t x = 0;
112 for (; x < width; x += 4)
113 {
114 const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
115 __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
116 const __m128i s0 = LOAD_SI128(src);
117 const __m128i s1 = LOAD_SI128(dst);
118 __m128i d0 = _mm_blendv_epi8(s1, s0, mask);
119 STORE_SI128(dst, d0);
120 }
121
122 for (; x < nWidth; x++)
123 {
124 const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
125 BYTE* dst = &dstLine[(x + nXDst) * dstByte];
126 *dst++ = *src++;
127 *dst++ = *src++;
128 *dst++ = *src++;
129 }
130 }
131
132 return PRIMITIVES_SUCCESS;
133}
134
135static pstatus_t sse_image_copy_no_overlap_dst_alpha(
136 BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
137 UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
138 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
139 UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
140 int64_t dstVOffset)
141{
142 WINPR_ASSERT(pDstData);
143 WINPR_ASSERT(pSrcData);
144
145 switch (SrcFormat)
146 {
147 case PIXEL_FORMAT_BGR24:
148 switch (DstFormat)
149 {
150 case PIXEL_FORMAT_BGRX32:
151 case PIXEL_FORMAT_BGRA32:
152 return sse_image_copy_bgr24_bgrx32(
153 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
154 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
155 default:
156 break;
157 }
158 break;
159 case PIXEL_FORMAT_BGRX32:
160 case PIXEL_FORMAT_BGRA32:
161 switch (DstFormat)
162 {
163 case PIXEL_FORMAT_BGRX32:
164 case PIXEL_FORMAT_BGRA32:
165 return sse_image_copy_bgrx32_bgrx32(
166 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
167 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
168 default:
169 break;
170 }
171 break;
172 case PIXEL_FORMAT_RGBX32:
173 case PIXEL_FORMAT_RGBA32:
174 switch (DstFormat)
175 {
176 case PIXEL_FORMAT_RGBX32:
177 case PIXEL_FORMAT_RGBA32:
178 return sse_image_copy_bgrx32_bgrx32(
179 pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
180 nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
181 default:
182 break;
183 }
184 break;
185 default:
186 break;
187 }
188
189 primitives_t* gen = primitives_get_generic();
190 return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
191 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
192}
193
194static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
195 UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
196 UINT32 nWidth, UINT32 nHeight,
197 const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
198 UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
199 const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
200{
201 const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
202 int64_t srcVOffset = 0;
203 int64_t srcVMultiplier = 1;
204 int64_t dstVOffset = 0;
205 int64_t dstVMultiplier = 1;
206
207 if ((nWidth == 0) || (nHeight == 0))
208 return PRIMITIVES_SUCCESS;
209
210 if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
211 return -1;
212
213 if (!pDstData || !pSrcData)
214 return -1;
215
216 if (nDstStep == 0)
217 nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
218
219 if (nSrcStep == 0)
220 nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
221
222 if (vSrcVFlip)
223 {
224 srcVOffset = (nHeight - 1ll) * nSrcStep;
225 srcVMultiplier = -1;
226 }
227
228 if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
229 return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
230 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
231 nXSrc, nYSrc, palette, flags, srcVMultiplier,
232 srcVOffset, dstVMultiplier, dstVOffset);
233 else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
234 return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
235 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
236 nXSrc, nYSrc, palette, srcVMultiplier,
237 srcVOffset, dstVMultiplier, dstVOffset, flags);
238 else
239 {
240 primitives_t* gen = primitives_get_generic();
241 return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
242 pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
243 }
244}
245#endif
246
247/* ------------------------------------------------------------------------- */
248void primitives_init_copy_sse41_int(primitives_t* WINPR_RESTRICT prims)
249{
250#if defined(SSE_AVX_INTRINSICS_ENABLED)
251 WLog_VRB(PRIM_TAG, "SSE4.1 optimizations");
252 prims->copy_no_overlap = sse_image_copy_no_overlap;
253#else
254 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE4.1 intrinsics not available");
255 WINPR_UNUSED(prims);
256#endif
257}
fn_copy_no_overlap_t copy_no_overlap
Definition primitives.h:304