FreeRDP
Loading...
Searching...
No Matches
prim_alphaComp_sse3.c
1/* FreeRDP: A Remote Desktop Protocol Client
2 * Optimized alpha blending routines.
3 * vi:ts=4 sw=4:
4 *
5 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7 * not use this file except in compliance with the License. You may obtain
8 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 * or implied. See the License for the specific language governing
13 * permissions and limitations under the License.
14 *
15 * Note: this code assumes the second operand is fully opaque,
16 * i.e.
17 * newval = alpha1*val1 + (1-alpha1)*val2
18 * rather than
19 * newval = alpha1*val1 + (1-alpha1)*alpha2*val2
20 * The IPP gives other options.
21 */
22
23#include <freerdp/config.h>
24
25#include <freerdp/types.h>
26#include <freerdp/primitives.h>
27#include <winpr/sysinfo.h>
28
29#include "prim_alphaComp.h"
30
31#include "prim_internal.h"
32#include "prim_avxsse.h"
33
34/* ------------------------------------------------------------------------- */
35#if defined(SSE_AVX_INTRINSICS_ENABLED)
36#include <emmintrin.h>
37#include <pmmintrin.h>
38
39static primitives_t* generic = NULL;
40
41static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
42 const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
43 BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
44 UINT32 height)
45{
46 const UINT32* sptr1 = (const UINT32*)pSrc1;
47 const UINT32* sptr2 = (const UINT32*)pSrc2;
48
49 if ((width <= 0) || (height <= 0))
50 return PRIMITIVES_SUCCESS;
51
52 if (width < 4) /* pointless if too small */
53 {
54 return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
55 height);
56 }
57
58 UINT32* dptr = (UINT32*)pDst;
59 const size_t linebytes = width * sizeof(UINT32);
60 const size_t src1Jump = (src1Step - linebytes) / sizeof(UINT32);
61 const size_t src2Jump = (src2Step - linebytes) / sizeof(UINT32);
62 const size_t dstJump = (dstStep - linebytes) / sizeof(UINT32);
63 __m128i xmm0 = mm_set1_epu32(0);
64 __m128i xmm1 = _mm_set1_epi16(1);
65
66 for (UINT32 y = 0; y < height; ++y)
67 {
68 uint32_t pixels = width;
69 uint32_t count = 0;
70 /* Get to the 16-byte boundary now. */
71 uint32_t leadIn = 0;
72
73 switch ((ULONG_PTR)dptr & 0x0f)
74 {
75 case 0:
76 leadIn = 0;
77 break;
78
79 case 4:
80 leadIn = 3;
81 break;
82
83 case 8:
84 leadIn = 2;
85 break;
86
87 case 12:
88 leadIn = 1;
89 break;
90
91 default:
92 /* We'll never hit a 16-byte boundary, so do the whole
93 * thing the slow way.
94 */
95 leadIn = width;
96 break;
97 }
98
99 if (leadIn)
100 {
101 pstatus_t status = 0;
102 status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
103 src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
104 if (status != PRIMITIVES_SUCCESS)
105 return status;
106
107 sptr1 += leadIn;
108 sptr2 += leadIn;
109 dptr += leadIn;
110 pixels -= leadIn;
111 }
112
113 /* Use SSE registers to do 4 pixels at a time. */
114 count = pixels >> 2;
115 pixels -= count << 2;
116
117 while (count--)
118 {
119 __m128i xmm2;
120 __m128i xmm3;
121 __m128i xmm4;
122 __m128i xmm5;
123 __m128i xmm6;
124 __m128i xmm7;
125 /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
126 xmm2 = LOAD_SI128(sptr1);
127 sptr1 += 4;
128 /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
129 xmm3 = LOAD_SI128(sptr2);
130 sptr2 += 4;
131 /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
132 xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
133 /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
134 xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
135 /* subtract */
136 xmm6 = _mm_subs_epi16(xmm4, xmm5);
137 /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
138 xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
139 /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
140 xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
141 /* Add one to alphas */
142 xmm4 = _mm_adds_epi16(xmm4, xmm1);
143 /* Multiply and take low word */
144 xmm4 = _mm_mullo_epi16(xmm4, xmm6);
145 /* Shift 8 right */
146 xmm4 = _mm_srai_epi16(xmm4, 8);
147 /* Add xmm5 */
148 xmm4 = _mm_adds_epi16(xmm4, xmm5);
149 /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
150 /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
151 xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
152 /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
153 xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
154 /* subtract */
155 xmm7 = _mm_subs_epi16(xmm5, xmm6);
156 /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
157 xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
158 /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
159 xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
160 /* Add one to alphas */
161 xmm5 = _mm_adds_epi16(xmm5, xmm1);
162 /* Multiply and take low word */
163 xmm5 = _mm_mullo_epi16(xmm5, xmm7);
164 /* Shift 8 right */
165 xmm5 = _mm_srai_epi16(xmm5, 8);
166 /* Add xmm6 */
167 xmm5 = _mm_adds_epi16(xmm5, xmm6);
168 /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
169 /* Must mask off remainders or pack gets confused */
170 xmm3 = _mm_set1_epi16(0x00ffU);
171 xmm4 = _mm_and_si128(xmm4, xmm3);
172 xmm5 = _mm_and_si128(xmm5, xmm3);
173 /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
174 xmm5 = _mm_packus_epi16(xmm5, xmm4);
175 STORE_SI128(dptr, xmm5);
176 dptr += 4;
177 }
178
179 /* Finish off the remainder. */
180 if (pixels)
181 {
182 pstatus_t status = 0;
183 status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
184 src2Step, (BYTE*)dptr, dstStep, pixels, 1);
185 if (status != PRIMITIVES_SUCCESS)
186 return status;
187
188 sptr1 += pixels;
189 sptr2 += pixels;
190 dptr += pixels;
191 }
192
193 /* Jump to next row. */
194 sptr1 += src1Jump;
195 sptr2 += src2Jump;
196 dptr += dstJump;
197 }
198
199 return PRIMITIVES_SUCCESS;
200}
201#endif
202
203/* ------------------------------------------------------------------------- */
204void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims)
205{
206#if defined(SSE_AVX_INTRINSICS_ENABLED)
207 generic = primitives_get_generic();
208 WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
209 prims->alphaComp_argb = sse2_alphaComp_argb;
210
211#else
212 WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
213 WINPR_UNUSED(prims);
214#endif
215}