17#include <freerdp/config.h>
19#include <freerdp/types.h>
20#include <freerdp/primitives.h>
21#include <winpr/sysinfo.h>
25#include "prim_internal.h"
26#include "prim_templates.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)

/*
 * Instantiate the SSE3_SSD_ROUTINE template as sse3_add_16s, handing it
 * generic->add_16s, the _mm_adds_epi16 intrinsic and a one-element call of
 * generic->add_16s.  The template itself is defined outside this file
 * (presumably prim_templates.h); see it for how the arguments are used.
 */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))

/**
 * In-place saturating add of two INT16 buffers.
 *
 * Every element pair is summed with _mm_adds_epi16 (or by the generic
 * implementation for the parts not handled with SSE) and the result is
 * written back to BOTH buffers.
 *
 * @param pSrcDst1 first operand, overwritten with the sum
 * @param pSrcDst2 second operand, overwritten with the sum
 * @param ulen     number of INT16 elements in each buffer
 *
 * @return PRIMITIVES_SUCCESS, or the status reported by
 *         generic->add_16s_inplace for the parts delegated to it.
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
	/* NOTE(review): the declaration of 'shifts' was not visible in the reviewed
	 * text.  2 is the value consistent with the block sizes used below:
	 * 1 << (7 - 2) = 32 elements (four registers) and 1 << (5 - 2) = 8 elements
	 * (one register). */
	const int shifts = 2;
	size_t len = ulen;
	INT16* dptr1 = pSrcDst1;
	INT16* dptr2 = pSrcDst2;

	/* NOTE(review): the exact small-length threshold was not visible either.
	 * 16 is the smallest value that keeps the alignment prologue below
	 * (at most 15 elements) inside the buffers - confirm. */
	if (ulen < 16)
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

	/* An address that is not a multiple of sizeof(INT16) can never reach a
	 * 16-byte boundary by stepping whole elements, so leave it to the generic
	 * implementation. */
	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst1 & offBeatMask)
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

	/* Bring dptr1 to a 16-byte boundary using the generic implementation. */
	const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
	if (rem != 0)
	{
		/* rem + add == 16 elements == 32 bytes, so dptr1 ends up aligned. */
		const UINT32 add = 16 - (UINT32)rem;
		pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;

		dptr1 += add;
		dptr2 += add;
		/* The prologue consumed 'add' elements; without this the loops below
		 * would run 'add' elements past the end of both buffers. */
		len -= add;
	}

	/* Four registers (32 elements) per iteration.
	 * The original distinguished aligned from unaligned pointers here, but both
	 * branches executed exactly the same statements, so one loop is enough.
	 * NOTE(review): LOAD_SI128/STORE_SI128 are defined outside this file; dptr2
	 * may have any alignment at this point, so they must tolerate unaligned
	 * addresses - confirm. */
	size_t count = len >> (7 - shifts);
	len -= count << (7 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1++);
		__m128i xmm1 = LOAD_SI128(vsptr1++);
		__m128i xmm2 = LOAD_SI128(vsptr1++);
		__m128i xmm3 = LOAD_SI128(vsptr1++);
		__m128i xmm4 = LOAD_SI128(vsptr2++);
		__m128i xmm5 = LOAD_SI128(vsptr2++);
		__m128i xmm6 = LOAD_SI128(vsptr2++);
		__m128i xmm7 = LOAD_SI128(vsptr2++);

		xmm0 = _mm_adds_epi16(xmm0, xmm4);
		xmm1 = _mm_adds_epi16(xmm1, xmm5);
		xmm2 = _mm_adds_epi16(xmm2, xmm6);
		xmm3 = _mm_adds_epi16(xmm3, xmm7);

		/* The sums go to both buffers. */
		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr1++, xmm1);
		STORE_SI128(vdptr1++, xmm2);
		STORE_SI128(vdptr1++, xmm3);

		STORE_SI128(vdptr2++, xmm0);
		STORE_SI128(vdptr2++, xmm1);
		STORE_SI128(vdptr2++, xmm2);
		STORE_SI128(vdptr2++, xmm3);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* One register (8 elements) per iteration. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1);
		__m128i xmm1 = LOAD_SI128(vsptr2);

		xmm0 = _mm_adds_epi16(xmm0, xmm1);

		STORE_SI128(vdptr1++, xmm0);
		STORE_SI128(vdptr2++, xmm0);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}

	/* Fewer than 8 elements left: let the generic implementation finish. */
	if (len > 0)
		return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));

	return PRIMITIVES_SUCCESS;
}
#endif
175void primitives_init_add_sse3_int(
primitives_t* WINPR_RESTRICT prims)
177#if defined(SSE_AVX_INTRINSICS_ENABLED)
178 generic = primitives_get_generic();
180 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
181 prims->add_16s = sse3_add_16s;
182 prims->add_16s_inplace = sse3_add_16s_inplace;
184 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE3 intrinsics not available");