prim_sign_ssse3.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_sign.h"

#include "prim_internal.h"
#include "prim_avxsse.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
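/* Per-sample sign of a 16-bit buffer: dst[i] becomes -1, 0 or +1 depending on
 * the sign of src[i]. Falls back to the generic implementation for short
 * buffers or destinations that cannot be brought to 16-byte alignment.
 */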
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
	size_t len = ulen;
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;
	size_t count = 0;

	if (len < 16)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}
	/* If the destination is not even 2-byte aligned, the scalar prologue below
	 * can never reach 16-byte alignment, so use the generic implementation.
	 */
	if ((ULONG_PTR)pDst & 0x01)
	{
		return generic->sign_16s(pSrc, pDst, ulen);
	}

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	count = len >> 5; /* / 32 */
	len -= count << 5; /* * 32 */

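	/* _mm_sign_epi16(ones, x) keeps, negates or zeroes each lane of 'ones'
	 * depending on whether the matching lane of 'x' is positive, negative or
	 * zero, so signing a vector of 1s by the source yields the sign directly.
	 */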
	if ((ULONG_PTR)sptr & 0x0f)
	{
		/* Unaligned */
		while (count--)
		{
			__m128i xmm0;
			__m128i xmm1;
			__m128i xmm2;
			__m128i xmm3;
			__m128i xmm4;
			__m128i xmm5;
			__m128i xmm6;
			__m128i xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = LOAD_SI128(sptr);
			sptr += 8;
			xmm5 = LOAD_SI128(sptr);
			sptr += 8;
			xmm6 = LOAD_SI128(sptr);
			sptr += 8;
			xmm7 = LOAD_SI128(sptr);
			sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			STORE_SI128(dptr, xmm0);
			dptr += 8;
			STORE_SI128(dptr, xmm1);
			dptr += 8;
			STORE_SI128(dptr, xmm2);
			dptr += 8;
			STORE_SI128(dptr, xmm3);
			dptr += 8;
		}
	}
	else
	{
		/* Aligned */
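		/* Same body as the unaligned branch above; the LOAD_SI128/STORE_SI128
		 * wrappers from prim_avxsse.h appear to absorb the difference, so the
		 * split mainly preserves the original structure of the routine.
		 */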
		while (count--)
		{
			__m128i xmm0;
			__m128i xmm1;
			__m128i xmm2;
			__m128i xmm3;
			__m128i xmm4;
			__m128i xmm5;
			__m128i xmm6;
			__m128i xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = LOAD_SI128(sptr);
			sptr += 8;
			xmm5 = LOAD_SI128(sptr);
			sptr += 8;
			xmm6 = LOAD_SI128(sptr);
			sptr += 8;
			xmm7 = LOAD_SI128(sptr);
			sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			STORE_SI128(dptr, xmm0);
			dptr += 8;
			STORE_SI128(dptr, xmm1);
			dptr += 8;
			STORE_SI128(dptr, xmm2);
			dptr += 8;
			STORE_SI128(dptr, xmm3);
			dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;
	len -= count << 3;

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		STORE_SI128(dptr, xmm0);
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
	}

	return PRIMITIVES_SUCCESS;
}

#endif /* SSE_AVX_INTRINSICS_ENABLED */

/* ------------------------------------------------------------------------- */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
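	/* The generic table supplies the scalar fallback that ssse3_sign_16s uses
	 * for short or unfavourably aligned buffers.
	 */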
	generic = primitives_get_generic();

	/* Pick tuned versions if possible. */
	/* I didn't spot an IPP version of this. */

	WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
	prims->sign_16s = ssse3_sign_16s;

#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
	WINPR_UNUSED(prims);
#endif
}
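
/* Usage sketch (kept out of the build with #if 0): how a caller might reach
 * this primitive through FreeRDP's public primitives table rather than calling
 * the SSSE3 routine directly. primitives_get() and the sign_16s member are the
 * public API declared in <freerdp/primitives.h> (already included above); the
 * buffer contents and sizes below are illustrative only.
 */
#if 0
static void sign_16s_usage_example(void)
{
	primitives_t* prims = primitives_get(); /* best implementation for this CPU */
	INT16 src[64] = { 0 };
	INT16 dst[64] = { 0 };

	for (size_t i = 0; i < 64; i++)
		src[i] = (INT16)((int)i - 32); /* negative, zero and positive samples */

	/* After the call, dst[i] is -1, 0 or +1 according to the sign of src[i]. */
	prims->sign_16s(src, dst, 64);
}
#endif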