FreeRDP
Loading...
Searching...
No Matches
prim_set_sse2.c
1/* FreeRDP: A Remote Desktop Protocol Client
2 * Optimized routines to set a chunk of memory to a constant.
3 * vi:ts=4 sw=4:
4 *
5 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7 * not use this file except in compliance with the License. You may obtain
8 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 * or implied. See the License for the specific language governing
13 * permissions and limitations under the License.
14 *
15 */
16
17#include <freerdp/config.h>
18
19#include <string.h>
20#include <freerdp/types.h>
21#include <freerdp/primitives.h>
22#include <winpr/sysinfo.h>
23
24#include "prim_internal.h"
25#include "prim_avxsse.h"
26#include "prim_set.h"
27
28/* ========================================================================= */
29#if defined(SSE_AVX_INTRINSICS_ENABLED)
30#include <emmintrin.h>
31
32static primitives_t* generic = NULL;
33
34static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 ulen)
35{
36 size_t len = ulen;
37 BYTE byte = 0;
38 BYTE* dptr = NULL;
39 __m128i xmm0;
40 size_t count = 0;
41
42 if (len < 16)
43 return generic->set_8u(val, pDst, ulen);
44
45 byte = val;
46 dptr = pDst;
47
48 /* Seek 16-byte alignment. */
49 while ((ULONG_PTR)dptr & 0x0f)
50 {
51 *dptr++ = byte;
52
53 if (--len == 0)
54 return PRIMITIVES_SUCCESS;
55 }
56
57 xmm0 = mm_set1_epu8(byte);
58 /* Cover 256-byte chunks via SSE register stores. */
59 count = len >> 8;
60 len -= count << 8;
61
62 /* Do 256-byte chunks using one XMM register. */
63 while (count--)
64 {
65 STORE_SI128(dptr, xmm0);
66 dptr += 16;
67 STORE_SI128(dptr, xmm0);
68 dptr += 16;
69 STORE_SI128(dptr, xmm0);
70 dptr += 16;
71 STORE_SI128(dptr, xmm0);
72 dptr += 16;
73 STORE_SI128(dptr, xmm0);
74 dptr += 16;
75 STORE_SI128(dptr, xmm0);
76 dptr += 16;
77 STORE_SI128(dptr, xmm0);
78 dptr += 16;
79 STORE_SI128(dptr, xmm0);
80 dptr += 16;
81 STORE_SI128(dptr, xmm0);
82 dptr += 16;
83 STORE_SI128(dptr, xmm0);
84 dptr += 16;
85 STORE_SI128(dptr, xmm0);
86 dptr += 16;
87 STORE_SI128(dptr, xmm0);
88 dptr += 16;
89 STORE_SI128(dptr, xmm0);
90 dptr += 16;
91 STORE_SI128(dptr, xmm0);
92 dptr += 16;
93 STORE_SI128(dptr, xmm0);
94 dptr += 16;
95 STORE_SI128(dptr, xmm0);
96 dptr += 16;
97 }
98
99 /* Cover 16-byte chunks via SSE register stores. */
100 count = len >> 4;
101 len -= count << 4;
102
103 /* Do 16-byte chunks using one XMM register. */
104 while (count--)
105 {
106 STORE_SI128(dptr, xmm0);
107 dptr += 16;
108 }
109
110 /* Do leftover bytes. */
111 while (len--)
112 *dptr++ = byte;
113
114 return PRIMITIVES_SUCCESS;
115}
116
117/* ------------------------------------------------------------------------- */
118static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 ulen)
119{
120 size_t len = ulen;
121 const primitives_t* prim = primitives_get_generic();
122 UINT32* dptr = pDst;
123 __m128i xmm0;
124 size_t count = 0;
125
126 /* If really short, just do it here. */
127 if (len < 32)
128 {
129 while (len--)
130 *dptr++ = val;
131
132 return PRIMITIVES_SUCCESS;
133 }
134
135 /* Assure we can reach 16-byte alignment. */
136 if (((ULONG_PTR)dptr & 0x03) != 0)
137 {
138 return prim->set_32u(val, pDst, ulen);
139 }
140
141 /* Seek 16-byte alignment. */
142 while ((ULONG_PTR)dptr & 0x0f)
143 {
144 *dptr++ = val;
145
146 if (--len == 0)
147 return PRIMITIVES_SUCCESS;
148 }
149
150 xmm0 = mm_set1_epu32(val);
151 /* Cover 256-byte chunks via SSE register stores. */
152 count = len >> 6;
153 len -= count << 6;
154
155 /* Do 256-byte chunks using one XMM register. */
156 while (count--)
157 {
158 STORE_SI128(dptr, xmm0);
159 dptr += 4;
160 STORE_SI128(dptr, xmm0);
161 dptr += 4;
162 STORE_SI128(dptr, xmm0);
163 dptr += 4;
164 STORE_SI128(dptr, xmm0);
165 dptr += 4;
166 STORE_SI128(dptr, xmm0);
167 dptr += 4;
168 STORE_SI128(dptr, xmm0);
169 dptr += 4;
170 STORE_SI128(dptr, xmm0);
171 dptr += 4;
172 STORE_SI128(dptr, xmm0);
173 dptr += 4;
174 STORE_SI128(dptr, xmm0);
175 dptr += 4;
176 STORE_SI128(dptr, xmm0);
177 dptr += 4;
178 STORE_SI128(dptr, xmm0);
179 dptr += 4;
180 STORE_SI128(dptr, xmm0);
181 dptr += 4;
182 STORE_SI128(dptr, xmm0);
183 dptr += 4;
184 STORE_SI128(dptr, xmm0);
185 dptr += 4;
186 STORE_SI128(dptr, xmm0);
187 dptr += 4;
188 STORE_SI128(dptr, xmm0);
189 dptr += 4;
190 }
191
192 /* Cover 16-byte chunks via SSE register stores. */
193 count = len >> 2;
194 len -= count << 2;
195
196 /* Do 16-byte chunks using one XMM register. */
197 while (count--)
198 {
199 STORE_SI128(dptr, xmm0);
200 dptr += 4;
201 }
202
203 /* Do leftover bytes. */
204 while (len--)
205 *dptr++ = val;
206
207 return PRIMITIVES_SUCCESS;
208}
209
210/* ------------------------------------------------------------------------- */
211static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
212{
213 UINT32 uval = *((UINT32*)&val);
214 return sse2_set_32u(uval, (UINT32*)pDst, len);
215}
216#endif
217
218/* ------------------------------------------------------------------------- */
219void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims)
220{
221#if defined(SSE_AVX_INTRINSICS_ENABLED)
222 generic = primitives_get_generic();
223
224 /* Pick tuned versions if possible. */
225
226 WLog_VRB(PRIM_TAG, "SSE2 optimizations");
227 prims->set_8u = sse2_set_8u;
228 prims->set_32s = sse2_set_32s;
229 prims->set_32u = sse2_set_32u;
230
231#else
232 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available");
233 WINPR_UNUSED(prims);
234#endif
235}