FreeRDP
prim_templates.h
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#pragma once

#include "prim_avxsse.h"

/* These macros are templates for SSE (potentially NEON) routines that apply a
 * simple SSE operation over an array of data. Since so much of the code is
 * shared except for the operation itself, these templates are used rather
 * than duplicating it. The naming convention depends on the parameters:
 * S=Source param; C=Constant; D=Destination.
 * All the macros take parameters for a fallback procedure (for when the data
 * is too small) and for doing the operation "the slow way" at 16-byte edges.
 */
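
/* For illustration only (the names below are hypothetical, not part of this
 * header): an "SCD" routine takes a Source array, a Constant and a
 * Destination array. Its _slowWay_ argument is a single-element statement,
 * e.g.
 *     *dptr++ = (_type_)(*sptr++ + val);
 * which the macros execute for whatever elements remain after the
 * 16-byte-wide SSE loops. Concrete instantiation examples follow each macro
 * below.
 */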

/* SSE3 note: If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than use LDDQU for unaligned reads.
 */
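
/* A minimal sketch of such an alignment check (illustrative only): the two
 * pointers can reach a common 16-byte boundary only when they share the same
 * misalignment, so an SSE2-only variant might fall back to the scalar
 * routine otherwise, e.g.
 *
 *     if ((((uintptr_t)pSrc ^ (uintptr_t)pDst) & 0x0F) != 0)
 *         return _fallback_(pSrc, val, pDst, ulen);
 *
 * (the fallback signature shown here is an assumption).
 */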

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It cannot do that as easily if the value is stored in a variable,
 * so don't save it as an intermediate value.
 */

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
                            _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
    { \
        size_t len = ulen; \
        INT32 shifts = 0; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        /* shifts ends up as log2(sizeof(_type_)) + 1, so (8 - shifts) below is \
         * log2(128 / sizeof(_type_)): the number of elements covered by the \
         * eight 16-byte registers per iteration. */ \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 8 128-bit SSE registers. */ \
        size_t count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
 \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm5 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm6 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm7 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            xmm1 = _op_(xmm1, (_op_type_)val); \
            xmm2 = _op_(xmm2, (_op_type_)val); \
            xmm3 = _op_(xmm3, (_op_type_)val); \
            xmm4 = _op_(xmm4, (_op_type_)val); \
            xmm5 = _op_(xmm5, (_op_type_)val); \
            xmm6 = _op_(xmm6, (_op_type_)val); \
            xmm7 = _op_(xmm7, (_op_type_)val); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm4); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm5); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm6); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm7); \
            dptr += (16 / sizeof(_type_)); \
        } \
 \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, (_op_type_)val); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
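
/* Illustrative instantiation of SSE3_SCD_ROUTINE (hypothetical names, shown
 * only as an example of how a prim_*.c file might use it): a routine that
 * left-shifts each 16-bit element by the constant val. The constant reaches
 * _op_ as (_op_type_)val, so UINT32 is passed straight to the intrinsic:
 *
 *     SSE3_SCD_ROUTINE(example_lShiftC_16u, UINT16, generic->lShiftC_16u,
 *                      _mm_slli_epi16, UINT32,
 *                      *dptr++ = (UINT16)(*sptr++ << val))
 */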

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
                            _type_* WINPR_RESTRICT pDst, INT32 ilen) \
    { \
        size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
        int shifts = 0; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        __m128i xmm0; \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 4 128-bit SSE registers. */ \
        size_t count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = mm_set1_epu32(val); \
        for (size_t x = 0; x < count; x++) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            xmm2 = _op_(xmm2, xmm0); \
            xmm3 = _op_(xmm3, xmm0); \
            xmm4 = _op_(xmm4, xmm0); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm4); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        for (size_t x = 0; x < count; x++) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        for (size_t x = 0; x < len; x++) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
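
/* Illustrative instantiation of SSE3_SCD_PRE_ROUTINE (hypothetical names):
 * this variant suits operations where the constant can be splatted across a
 * register once, for example ANDing every 32-bit element with a mask:
 *
 *     SSE3_SCD_PRE_ROUTINE(example_andC_32u, UINT32, generic->andC_32u,
 *                          _mm_and_si128, *dptr++ = *sptr++ & val)
 */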

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
                            const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
                            UINT32 ulen) \
    { \
        size_t len = ulen; \
        int shifts = 0; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        /* Aligned loads */ \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm2 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm3 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm4 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm5 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm6 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            __m128i xmm7 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm4); \
            xmm1 = _op_(xmm1, xmm5); \
            xmm2 = _op_(xmm2, xmm6); \
            xmm3 = _op_(xmm3, xmm7); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm2); \
            dptr += (16 / sizeof(_type_)); \
            STORE_SI128(dptr, xmm3); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            __m128i xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            STORE_SI128(dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
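
/* Illustrative instantiation of SSE3_SSD_ROUTINE (hypothetical names): an SSD
 * routine combines two source arrays element by element, for example XORing
 * two UINT32 buffers:
 *
 *     SSE3_SSD_ROUTINE(example_xor_32u, UINT32, generic->xor_32u,
 *                      _mm_xor_si128, *dptr++ = *sptr1++ ^ *sptr2++)
 */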