FreeRDP
Loading...
Searching...
No Matches
prim_add_sse3.c
1/* FreeRDP: A Remote Desktop Protocol Client
2 * Optimized add operations.
3 * vi:ts=4 sw=4:
4 *
5 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7 * not use this file except in compliance with the License. You may obtain
8 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 * or implied. See the License for the specific language governing
13 * permissions and limitations under the License.
14 *
15 */
16
17#include <freerdp/config.h>
18
19#include <freerdp/types.h>
20#include <freerdp/primitives.h>
21#include <winpr/sysinfo.h>
22
23#include "prim_add.h"
24
25#include "prim_internal.h"
26#include "prim_templates.h"
27
28#if defined(SSE_AVX_INTRINSICS_ENABLED)
29#include <emmintrin.h>
30#include <pmmintrin.h>
31
/* Table of generic (non-SIMD) primitives. It is assigned from
 * primitives_get_generic() in primitives_init_add_sse3_int() and is used by
 * the SSE routines in this file as the scalar fallback implementation. */
32static primitives_t* generic = NULL;
33
34/* ------------------------------------------------------------------------- */
/* Instantiates sse3_add_16s from the SSE3_SSD_ROUTINE template macro
 * (presumably provided by prim_templates.h -- confirm). Arguments as passed:
 *   - sse3_add_16s:      name of the generated routine
 *   - INT16:             element type
 *   - generic->add_16s:  scalar fallback routine
 *   - _mm_adds_epi16:    SSE2 intrinsic, signed saturating 16-bit add
 *   - last argument:     expression processing exactly one element through
 *                        the generic routine; presumably used by the template
 *                        for leading/trailing elements -- verify in the macro.
 */
35SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
36 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
37
38static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
39 INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
40{
41 const int shifts = 2;
42 INT16* dptr1 = pSrcDst1;
43 INT16* dptr2 = pSrcDst2;
44
45 if (ulen < 16) /* pointless if too small */
46 return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
47
48 UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
49 if ((ULONG_PTR)pSrcDst1 & offBeatMask)
50 {
51 /* Incrementing the pointer skips over 16-byte boundary. */
52 return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
53 }
54 /* Get to the 16-byte boundary now. */
55 const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
56 if (rem != 0)
57 {
58 const UINT32 add = 16 - (UINT32)rem;
59 pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
60 if (status != PRIMITIVES_SUCCESS)
61 return status;
62 dptr1 += add;
63 dptr2 += add;
64 }
65 /* Use 4 128-bit SSE registers. */
66 size_t len = ulen;
67 size_t count = len >> (7 - shifts);
68 len -= count << (7 - shifts);
69 if (((const ULONG_PTR)dptr1 & 0x0f) || ((const ULONG_PTR)dptr2 & 0x0f))
70 {
71 /* Unaligned loads */
72 while (count--)
73 {
74 const __m128i* vsptr1 = (const __m128i*)dptr1;
75 const __m128i* vsptr2 = (const __m128i*)dptr2;
76 __m128i* vdptr1 = (__m128i*)dptr1;
77 __m128i* vdptr2 = (__m128i*)dptr2;
78
79 __m128i xmm0 = LOAD_SI128(vsptr1++);
80 __m128i xmm1 = LOAD_SI128(vsptr1++);
81 __m128i xmm2 = LOAD_SI128(vsptr1++);
82 __m128i xmm3 = LOAD_SI128(vsptr1++);
83 __m128i xmm4 = LOAD_SI128(vsptr2++);
84 __m128i xmm5 = LOAD_SI128(vsptr2++);
85 __m128i xmm6 = LOAD_SI128(vsptr2++);
86 __m128i xmm7 = LOAD_SI128(vsptr2++);
87
88 xmm0 = _mm_adds_epi16(xmm0, xmm4);
89 xmm1 = _mm_adds_epi16(xmm1, xmm5);
90 xmm2 = _mm_adds_epi16(xmm2, xmm6);
91 xmm3 = _mm_adds_epi16(xmm3, xmm7);
92
93 STORE_SI128(vdptr1++, xmm0);
94 STORE_SI128(vdptr1++, xmm1);
95 STORE_SI128(vdptr1++, xmm2);
96 STORE_SI128(vdptr1++, xmm3);
97
98 STORE_SI128(vdptr2++, xmm0);
99 STORE_SI128(vdptr2++, xmm1);
100 STORE_SI128(vdptr2++, xmm2);
101 STORE_SI128(vdptr2++, xmm3);
102
103 dptr1 = (INT16*)vdptr1;
104 dptr2 = (INT16*)vdptr2;
105 }
106 }
107 else
108 {
109 /* Aligned loads */
110 while (count--)
111 {
112 const __m128i* vsptr1 = (const __m128i*)dptr1;
113 const __m128i* vsptr2 = (const __m128i*)dptr2;
114 __m128i* vdptr1 = (__m128i*)dptr1;
115 __m128i* vdptr2 = (__m128i*)dptr2;
116
117 __m128i xmm0 = LOAD_SI128(vsptr1++);
118 __m128i xmm1 = LOAD_SI128(vsptr1++);
119 __m128i xmm2 = LOAD_SI128(vsptr1++);
120 __m128i xmm3 = LOAD_SI128(vsptr1++);
121 __m128i xmm4 = LOAD_SI128(vsptr2++);
122 __m128i xmm5 = LOAD_SI128(vsptr2++);
123 __m128i xmm6 = LOAD_SI128(vsptr2++);
124 __m128i xmm7 = LOAD_SI128(vsptr2++);
125
126 xmm0 = _mm_adds_epi16(xmm0, xmm4);
127 xmm1 = _mm_adds_epi16(xmm1, xmm5);
128 xmm2 = _mm_adds_epi16(xmm2, xmm6);
129 xmm3 = _mm_adds_epi16(xmm3, xmm7);
130
131 STORE_SI128(vdptr1++, xmm0);
132 STORE_SI128(vdptr1++, xmm1);
133 STORE_SI128(vdptr1++, xmm2);
134 STORE_SI128(vdptr1++, xmm3);
135
136 STORE_SI128(vdptr2++, xmm0);
137 STORE_SI128(vdptr2++, xmm1);
138 STORE_SI128(vdptr2++, xmm2);
139 STORE_SI128(vdptr2++, xmm3);
140
141 dptr1 = (INT16*)vdptr1;
142 dptr2 = (INT16*)vdptr2;
143 }
144 }
145 /* Use a single 128-bit SSE register. */
146 count = len >> (5 - shifts);
147 len -= count << (5 - shifts);
148 while (count--)
149 {
150 const __m128i* vsptr1 = (const __m128i*)dptr1;
151 const __m128i* vsptr2 = (const __m128i*)dptr2;
152 __m128i* vdptr1 = (__m128i*)dptr1;
153 __m128i* vdptr2 = (__m128i*)dptr2;
154
155 __m128i xmm0 = LOAD_SI128(vsptr1);
156 __m128i xmm1 = LOAD_SI128(vsptr2);
157
158 xmm0 = _mm_adds_epi16(xmm0, xmm1);
159
160 STORE_SI128(vdptr1++, xmm0);
161 STORE_SI128(vdptr2++, xmm0);
162
163 dptr1 = (INT16*)vdptr1;
164 dptr2 = (INT16*)vdptr2;
165 }
166 /* Finish off the remainder. */
167 if (len > 0)
168 return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));
169
170 return PRIMITIVES_SUCCESS;
171}
172#endif
173
174/* ------------------------------------------------------------------------- */
175void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
176{
177#if defined(SSE_AVX_INTRINSICS_ENABLED)
178 generic = primitives_get_generic();
179
180 WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
181 prims->add_16s = sse3_add_16s;
182 prims->add_16s_inplace = sse3_add_16s_inplace;
183#else
184 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
185 WINPR_UNUSED(prims);
186#endif
187}