16#include <freerdp/config.h>
18#include <freerdp/types.h>
19#include <freerdp/primitives.h>
20#include <winpr/sysinfo.h>
22#include "prim_shift.h"
24#include "prim_internal.h"
25#include "prim_templates.h"
27#if defined(SSE_AVX_INTRINSICS_ENABLED)
/* Instantiates sse2_lShiftC_16s through the SSE3_SCD_ROUTINE template macro
 * (defined outside this file excerpt; presumably in prim_templates.h - TODO confirm).
 * Arguments as passed here: routine name, element type INT16, the generic
 * lShiftC_16s routine, the _mm_slli_epi16 intrinsic (16-bit left shift),
 * the type int16_t, and a scalar per-element statement.
 * The scalar statement casts the source element to UINT16 before shifting so the
 * shift is performed on a non-negative value, masks the result to 16 bits and
 * stores it as INT16. */
34SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
35 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* Instantiates sse2_rShiftC_16s through the SSE3_SCD_ROUTINE template macro
 * (macro body not visible in this excerpt).
 * Uses _mm_srai_epi16, the arithmetic (sign-propagating) 16-bit right shift,
 * for the vector path and passes generic->rShiftC_16s as the generic routine.
 * The scalar statement right-shifts each signed source element by val. */
37SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
38 *dptr++ = *sptr++ >> val)
/* Instantiates sse2_lShiftC_16u through the SSE3_SCD_ROUTINE template macro
 * (macro body not visible in this excerpt).
 * Unsigned counterpart of the 16s left shift: element type UINT16,
 * generic->lShiftC_16u as the generic routine, _mm_slli_epi16 as the intrinsic.
 * The scalar statement shifts each element left by val and masks the result
 * to the low 16 bits. */
40SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
41 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* Instantiates sse2_rShiftC_16u through the SSE3_SCD_ROUTINE template macro
 * (macro body not visible in this excerpt).
 * Uses _mm_srli_epi16, the logical (zero-filling) 16-bit right shift, for the
 * vector path and passes generic->rShiftC_16u as the generic routine.
 * The scalar statement right-shifts each unsigned source element by val. */
43SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
44 *dptr++ = *sptr++ >> val)
/*
 * In-place left shift of a buffer of INT16 elements by `val` bits, using the
 * _mm_slli_epi16 intrinsic on 128-bit vectors and delegating to
 * generic->lShiftC_16s_inplace for the parts it does not vectorize.
 *
 * pSrcDst: buffer that is both read and written.
 * val:     shift amount in bits.
 * ulen:    passed on as the length to the generic routine.
 * Returns PRIMITIVES_SUCCESS, or whatever the generic routine returns.
 *
 * NOTE(review): this excerpt does not show the braces, the declaration of
 * `len`, or the conditions/loop headers guarding several statements below;
 * confirm against the complete file before relying on the control flow.
 */
46static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
/* `shifts` feeds the chunk-size computations below (8 - shifts, 5 - shifts)
 * and the pointer-alignment mask. */
49 const INT32 shifts = 2;
/* NOTE(review): the conditions guarding the next two early returns are not
 * visible in this excerpt. */
51 return PRIMITIVES_SUCCESS;
55 return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
/* With shifts == 2 the mask is 1: it tests the lowest address bit. If that bit
 * is set, the whole request is handed to the generic routine. */
57 UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
58 if ((ULONG_PTR)pSrcDst & offBeatMask)
61 return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
/* Offset of the pointer past the previous 16-byte boundary, expressed in
 * INT16 elements. */
64 const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) /
sizeof(INT16);
/* The generic routine processes the first `add` elements.
 * NOTE(review): the guard on `rem`, the failure return and the subsequent
 * adjustment of pSrcDst/len are presumably on lines not shown - confirm. */
67 const UINT32 add = 16 - rem;
68 pstatus_t status =
generic->lShiftC_16s_inplace(pSrcDst, val, add);
69 if (status != PRIMITIVES_SUCCESS)
/* 8 - shifts == 6: `count` is the number of 64-element chunks (eight 128-bit
 * vectors of eight INT16 each); `len` keeps the elements not covered by them. */
76 size_t count = len >> (8 - shifts);
77 len -= count << (8 - shifts);
/* Load eight consecutive 128-bit vectors from the buffer.
 * NOTE(review): presumably executed once per `count`; loop header not visible. */
81 const __m128i* src = (
const __m128i*)pSrcDst;
83 __m128i xmm0 = LOAD_SI128(src++);
84 __m128i xmm1 = LOAD_SI128(src++);
85 __m128i xmm2 = LOAD_SI128(src++);
86 __m128i xmm3 = LOAD_SI128(src++);
87 __m128i xmm4 = LOAD_SI128(src++);
88 __m128i xmm5 = LOAD_SI128(src++);
89 __m128i xmm6 = LOAD_SI128(src++);
90 __m128i xmm7 = LOAD_SI128(src);
/* Shift every 16-bit lane of each vector left by `val` bits. */
92 xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
93 xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
94 xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
95 xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
96 xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
97 xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
98 xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
99 xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);
/* Write the shifted vectors back over the same memory and advance pSrcDst
 * past the eight vectors just stored. */
101 __m128i* dst = (__m128i*)pSrcDst;
103 STORE_SI128(dst++, xmm0);
104 STORE_SI128(dst++, xmm1);
105 STORE_SI128(dst++, xmm2);
106 STORE_SI128(dst++, xmm3);
107 STORE_SI128(dst++, xmm4);
108 STORE_SI128(dst++, xmm5);
109 STORE_SI128(dst++, xmm6);
110 STORE_SI128(dst++, xmm7);
112 pSrcDst = (INT16*)dst;
/* 5 - shifts == 3: `count` is now the number of remaining single-vector
 * (8-element) chunks; `len` keeps what is left after those. */
116 count = len >> (5 - shifts);
117 len -= count << (5 - shifts);
/* One vector at a time: load, shift, store, advance.
 * NOTE(review): presumably executed once per `count`; loop header not visible. */
120 const __m128i* src = (
const __m128i*)pSrcDst;
121 __m128i xmm0 = LOAD_SI128(src);
123 xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
125 __m128i* dst = (__m128i*)pSrcDst;
126 STORE_SI128(dst++, xmm0);
127 pSrcDst = (INT16*)dst;
/* The remaining `len` elements are handed to the generic routine.
 * NOTE(review): the condition selecting between the two returns below is not
 * visible in this excerpt. */
132 return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));
134 return PRIMITIVES_SUCCESS;
/*
 * Installs the SSE shift routines from this file into the `prims` table.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, it stores the result of
 * primitives_get_generic() in `generic`, emits a verbose log message and
 * assigns the five shift entries of `prims`.
 *
 * NOTE(review): the braces and the #else/#endif lines are not visible in this
 * excerpt; the second WLog_VRB call is presumably the branch taken when the
 * intrinsics are not enabled - confirm against the complete file.
 */
144void primitives_init_shift_sse3_int(
primitives_t* WINPR_RESTRICT prims)
146#if defined(SSE_AVX_INTRINSICS_ENABLED)
/* `generic` is the table the routines in this file call for the parts they do
 * not vectorize. */
147 generic = primitives_get_generic();
149 WLog_VRB(PRIM_TAG,
"SSE2/SSE3 optimizations");
/* Replace the shift entries with the implementations defined in this file. */
150 prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
151 prims->lShiftC_16s = sse2_lShiftC_16s;
152 prims->rShiftC_16s = sse2_rShiftC_16s;
153 prims->lShiftC_16u = sse2_lShiftC_16u;
154 prims->rShiftC_16u = sse2_rShiftC_16u;
157 WLog_VRB(PRIM_TAG,
"undefined WITH_SIMD or SSE3 intrinsics not available");