 |
FreeRDP
|
Loading...
Searching...
No Matches
18#include "prim_avxsse.h"
/*
 * SSE3_SCD_ROUTINE: template macro that defines a static function `_name_`
 * taking a source pointer, a scalar `val`, a destination pointer and an
 * element count. The visible body loads __m128i vectors from pSrc, applies
 * `_op_(vector, (_op_type_)val)` to each, and stores the results to pDst,
 * first in an unrolled group of eight vectors and then one vector at a time.
 *
 * NOTE(review): this listing skips lines (the embedded numbers jump, e.g.
 * 45 -> 49), so braces, loop headers, the declarations of `len`/`shifts`
 * and any use of `_fallback_` / `_slowWay_` are not visible here. The
 * comments below describe only the statements that are shown.
 */
43#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
44 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
45 _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
/* Working cursors; the caller-visible pointers are left untouched. */ \
49 const _type_* sptr = pSrc; \
50 _type_* dptr = pDst; \
/* Early success return; presumably guarded by a condition on a line not shown. */ \
52 return PRIMITIVES_SUCCESS; \
/* Dispatch on element size; branch bodies are not shown (presumably they set `shifts`). */ \
55 if (sizeof(_type_) == 1) \
57 else if (sizeof(_type_) == 2) \
59 else if (sizeof(_type_) == 4) \
61 else if (sizeof(_type_) == 8) \
/* count = number of unrolled groups; len keeps the remainder. */ \
64 size_t count = len >> (8 - shifts); \
65 len -= count << (8 - shifts); \
/* Unrolled step: load eight vectors, advancing sptr by 16 bytes' worth of elements each. */ \
69 __m128i xmm0 = LOAD_SI128(sptr); \
70 sptr += (16 / sizeof(_type_)); \
71 __m128i xmm1 = LOAD_SI128(sptr); \
72 sptr += (16 / sizeof(_type_)); \
73 __m128i xmm2 = LOAD_SI128(sptr); \
74 sptr += (16 / sizeof(_type_)); \
75 __m128i xmm3 = LOAD_SI128(sptr); \
76 sptr += (16 / sizeof(_type_)); \
77 __m128i xmm4 = LOAD_SI128(sptr); \
78 sptr += (16 / sizeof(_type_)); \
79 __m128i xmm5 = LOAD_SI128(sptr); \
80 sptr += (16 / sizeof(_type_)); \
81 __m128i xmm6 = LOAD_SI128(sptr); \
82 sptr += (16 / sizeof(_type_)); \
83 __m128i xmm7 = LOAD_SI128(sptr); \
84 sptr += (16 / sizeof(_type_)); \
/* Apply the operation with the scalar operand to every loaded vector. */ \
85 xmm0 = _op_(xmm0, (_op_type_)val); \
86 xmm1 = _op_(xmm1, (_op_type_)val); \
87 xmm2 = _op_(xmm2, (_op_type_)val); \
88 xmm3 = _op_(xmm3, (_op_type_)val); \
89 xmm4 = _op_(xmm4, (_op_type_)val); \
90 xmm5 = _op_(xmm5, (_op_type_)val); \
91 xmm6 = _op_(xmm6, (_op_type_)val); \
92 xmm7 = _op_(xmm7, (_op_type_)val); \
/* Write the eight results, advancing dptr in step with sptr. */ \
93 STORE_SI128(dptr, xmm0); \
94 dptr += (16 / sizeof(_type_)); \
95 STORE_SI128(dptr, xmm1); \
96 dptr += (16 / sizeof(_type_)); \
97 STORE_SI128(dptr, xmm2); \
98 dptr += (16 / sizeof(_type_)); \
99 STORE_SI128(dptr, xmm3); \
100 dptr += (16 / sizeof(_type_)); \
101 STORE_SI128(dptr, xmm4); \
102 dptr += (16 / sizeof(_type_)); \
103 STORE_SI128(dptr, xmm5); \
104 dptr += (16 / sizeof(_type_)); \
105 STORE_SI128(dptr, xmm6); \
106 dptr += (16 / sizeof(_type_)); \
107 STORE_SI128(dptr, xmm7); \
108 dptr += (16 / sizeof(_type_)); \
/* Second split of the remainder, consumed one vector per step below. */ \
112 count = len >> (5 - shifts); \
113 len -= count << (5 - shifts); \
/* Single-vector step: load, apply, store. */ \
116 __m128i xmm0 = LOAD_SI128(sptr); \
117 sptr += (16 / sizeof(_type_)); \
118 xmm0 = _op_(xmm0, (_op_type_)val); \
119 STORE_SI128(dptr, xmm0); \
120 dptr += (16 / sizeof(_type_)); \
/* Final status; handling of any elements still left in len is not shown in this listing. */ \
127 return PRIMITIVES_SUCCESS; \
/*
 * SSE3_SCD_PRE_ROUTINE: template macro that defines a static function
 * `_name_` combining every source vector with one pre-built constant vector.
 * The scalar `val` is expanded once into xmm0 via mm_set1_epu32(), then
 * `_op_(vector, xmm0)` is applied in an unrolled group of four vectors,
 * then one vector at a time, followed by a per-element loop over the
 * remaining `len` elements (its body is not visible here).
 *
 * NOTE(review): this listing skips lines (embedded numbers jump), so braces,
 * the declarations of `shifts` and `xmm0`, the sizeof-branch bodies and any
 * use of `_fallback_` / `_slowWay_` are not shown. Comments cover only the
 * visible statements.
 */
134#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
135 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
136 _type_* WINPR_RESTRICT pDst, INT32 ilen) \
/* Convert the signed length to size_t through the asserting cast macro. */ \
138 size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
/* Working cursors over source and destination. */ \
140 const _type_* sptr = pSrc; \
141 _type_* dptr = pDst; \
/* Dispatch on element size; branch bodies are not shown (presumably they set `shifts`). */ \
143 if (sizeof(_type_) == 1) \
145 else if (sizeof(_type_) == 2) \
147 else if (sizeof(_type_) == 4) \
149 else if (sizeof(_type_) == 8) \
/* count = number of unrolled groups; len keeps the remainder. */ \
152 size_t count = len >> (7 - shifts); \
153 len -= count << (7 - shifts); \
/* Build the constant operand once, outside the loops. */ \
/* NOTE(review): mm_set1_epu32 is defined outside this listing; by its name it */ \
/* presumably fills 32-bit lanes - confirm each instantiated _type_ matches that width. */ \
154 xmm0 = mm_set1_epu32(val); \
155 for (size_t x = 0; x < count; x++) \
/* Unrolled step: load four vectors, 16 bytes' worth of elements apart. */ \
157 __m128i xmm1 = LOAD_SI128(sptr); \
158 sptr += (16 / sizeof(_type_)); \
159 __m128i xmm2 = LOAD_SI128(sptr); \
160 sptr += (16 / sizeof(_type_)); \
161 __m128i xmm3 = LOAD_SI128(sptr); \
162 sptr += (16 / sizeof(_type_)); \
163 __m128i xmm4 = LOAD_SI128(sptr); \
164 sptr += (16 / sizeof(_type_)); \
/* Combine each loaded vector with the constant vector. */ \
165 xmm1 = _op_(xmm1, xmm0); \
166 xmm2 = _op_(xmm2, xmm0); \
167 xmm3 = _op_(xmm3, xmm0); \
168 xmm4 = _op_(xmm4, xmm0); \
/* Write the four results, advancing dptr in step with sptr. */ \
169 STORE_SI128(dptr, xmm1); \
170 dptr += (16 / sizeof(_type_)); \
171 STORE_SI128(dptr, xmm2); \
172 dptr += (16 / sizeof(_type_)); \
173 STORE_SI128(dptr, xmm3); \
174 dptr += (16 / sizeof(_type_)); \
175 STORE_SI128(dptr, xmm4); \
176 dptr += (16 / sizeof(_type_)); \
/* Second split of the remainder, consumed one vector per iteration. */ \
179 count = len >> (5 - shifts); \
180 len -= count << (5 - shifts); \
181 for (size_t x = 0; x < count; x++) \
183 __m128i xmm1 = LOAD_SI128(sptr); \
184 sptr += (16 / sizeof(_type_)); \
185 xmm1 = _op_(xmm1, xmm0); \
186 STORE_SI128(dptr, xmm1); \
187 dptr += (16 / sizeof(_type_)); \
/* Per-element loop over what is left in len; its body is not shown in this listing. */ \
190 for (size_t x = 0; x < len; x++) \
194 return PRIMITIVES_SUCCESS; \
/*
 * SSE3_SSD_ROUTINE: template macro that defines a static function `_name_`
 * combining two source arrays vector-by-vector into a destination. The
 * visible body loads __m128i vectors from pSrc1 and pSrc2, applies
 * `_op_(vec1, vec2)` pairwise, and stores the results to pDst, first in an
 * unrolled group of four vector pairs and then one pair at a time.
 *
 * NOTE(review): this listing skips lines (embedded numbers jump), so the
 * rest of the parameter list, braces, loop headers, the declarations of
 * `len`/`count`/`shifts`, the sizeof-branch bodies and any use of
 * `_fallback_` / `_slowWay_` are not shown, and the macro may continue past
 * the last visible line. Comments cover only the visible statements.
 */
200#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
201 static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
202 const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
/* Working cursors over both sources and the destination. */ \
207 const _type_* sptr1 = pSrc1; \
208 const _type_* sptr2 = pSrc2; \
209 _type_* dptr = pDst; \
/* Dispatch on element size; branch bodies are not shown (presumably they set `shifts`). */ \
211 if (sizeof(_type_) == 1) \
213 else if (sizeof(_type_) == 2) \
215 else if (sizeof(_type_) == 4) \
217 else if (sizeof(_type_) == 8) \
/* count = number of unrolled groups; len keeps the remainder. */ \
220 count = len >> (7 - shifts); \
221 len -= count << (7 - shifts); \
/* Unrolled step: four vectors from the first source... */ \
225 __m128i xmm0 = LOAD_SI128(sptr1); \
226 sptr1 += (16 / sizeof(_type_)); \
227 __m128i xmm1 = LOAD_SI128(sptr1); \
228 sptr1 += (16 / sizeof(_type_)); \
229 __m128i xmm2 = LOAD_SI128(sptr1); \
230 sptr1 += (16 / sizeof(_type_)); \
231 __m128i xmm3 = LOAD_SI128(sptr1); \
232 sptr1 += (16 / sizeof(_type_)); \
/* ...and the four matching vectors from the second source. */ \
233 __m128i xmm4 = LOAD_SI128(sptr2); \
234 sptr2 += (16 / sizeof(_type_)); \
235 __m128i xmm5 = LOAD_SI128(sptr2); \
236 sptr2 += (16 / sizeof(_type_)); \
237 __m128i xmm6 = LOAD_SI128(sptr2); \
238 sptr2 += (16 / sizeof(_type_)); \
239 __m128i xmm7 = LOAD_SI128(sptr2); \
240 sptr2 += (16 / sizeof(_type_)); \
/* Pair vector i of source 1 with vector i of source 2. */ \
241 xmm0 = _op_(xmm0, xmm4); \
242 xmm1 = _op_(xmm1, xmm5); \
243 xmm2 = _op_(xmm2, xmm6); \
244 xmm3 = _op_(xmm3, xmm7); \
/* Write the four results, advancing dptr by 16 bytes' worth of elements each. */ \
245 STORE_SI128(dptr, xmm0); \
246 dptr += (16 / sizeof(_type_)); \
247 STORE_SI128(dptr, xmm1); \
248 dptr += (16 / sizeof(_type_)); \
249 STORE_SI128(dptr, xmm2); \
250 dptr += (16 / sizeof(_type_)); \
251 STORE_SI128(dptr, xmm3); \
252 dptr += (16 / sizeof(_type_)); \
/* Second split of the remainder, consumed one vector pair per step. */ \
255 count = len >> (5 - shifts); \
256 len -= count << (5 - shifts); \
259 __m128i xmm0 = LOAD_SI128(sptr1); \
260 sptr1 += (16 / sizeof(_type_)); \
261 __m128i xmm1 = LOAD_SI128(sptr2); \
262 sptr2 += (16 / sizeof(_type_)); \
263 xmm0 = _op_(xmm0, xmm1); \
264 STORE_SI128(dptr, xmm0); \
265 dptr += (16 / sizeof(_type_)); \
/* Final status; handling of any elements still left in len is not shown in this listing. */ \
272 return PRIMITIVES_SUCCESS; \