1 :
2 : /*
3 : * Copyright 2009 The Android Open Source Project
4 : *
5 : * Use of this source code is governed by a BSD-style license that can be
6 : * found in the LICENSE file.
7 : */
8 :
9 :
10 : #include <emmintrin.h>
11 : #include "SkBitmapProcState_opts_SSE2.h"
12 : #include "SkUtils.h"
13 :
// Bilinear-filtered sampling of 32-bit (8 bits per channel) pixels along one
// pair of source rows, using SSE2. Requires s.fAlphaScale == 256 (asserted),
// so no extra alpha scaling is applied to the result.
//
// Input layout, as decoded below:
//   xy[0]    : bits 18..31 = row0 index, bits 14..17 = y weight (0..15),
//              bits 0..13  = row1 index.
//   xy[1...] : one word per output pixel,
//              bits 18..31 = x0, bits 14..17 = x weight (0..15),
//              bits 0..13  = x1.
// One word is read before the loop and one per iteration, so count + 1 words
// of xy are consumed and count pixels are written to colors.
//
// With aRC = pixel at row R / column C, every 8-bit channel is computed as
//   (a00*(16-x)*(16-y) + a01*x*(16-y) + a10*(16-x)*y + a11*x*y) >> 8
// The four weights always sum to 256 and each channel is at most 255, so the
// intermediate values fit in the 16-bit lanes used for the arithmetic.
//
// Lane listings in the comments below run from the most significant lane
// (left) to the least significant lane (right).
//
// The loop is a do/while, so its body runs at least once; callers must honor
// the count > 0 assertion.
14 0 : void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
15 : const uint32_t* xy,
16 : int count, uint32_t* colors) {
17 0 : SkASSERT(count > 0 && colors != NULL);
18 0 : SkASSERT(s.fDoFilter);
19 0 : SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
20 0 : SkASSERT(s.fAlphaScale == 256);
21 :
// Decode the packed Y word: y0 holds (row0 index << 4) | subY, and the low
// 14 bits of XY hold the row1 index. Row addresses are byte offsets
// (index * rowBytes) from the start of the pixel buffer.
22 0 : const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
23 0 : unsigned rb = s.fBitmap->rowBytes();
24 0 : uint32_t XY = *xy++;
25 0 : unsigned y0 = XY >> 14;
26 0 : const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
27 0 : const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
28 0 : unsigned subY = y0 & 0xF;
29 :
30 : // ( 0, 0, 0, 0, 0, 0, 0, 16)
31 0 : __m128i sixteen = _mm_cvtsi32_si128(16);
32 :
33 : // ( 0, 0, 0, 0, 16, 16, 16, 16)
34 0 : sixteen = _mm_shufflelo_epi16(sixteen, 0);
35 :
36 : // ( 0, 0, 0, 0, 0, 0, 0, y)
37 0 : __m128i allY = _mm_cvtsi32_si128(subY);
38 :
39 : // ( 0, 0, 0, 0, y, y, y, y)
40 0 : allY = _mm_shufflelo_epi16(allY, 0);
41 :
42 : // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
43 0 : __m128i negY = _mm_sub_epi16(sixteen, allY);
44 :
45 : // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
// The upper half weights the row0 pixel and the lower half the row1 pixel,
// matching the (row0, row1) packing of the pixel pairs built in the loop.
46 0 : allY = _mm_unpacklo_epi64(allY, negY);
47 :
48 : // (16, 16, 16, 16, 16, 16, 16, 16 )
49 0 : sixteen = _mm_shuffle_epi32(sixteen, 0);
50 :
51 : // ( 0, 0, 0, 0, 0, 0, 0, 0)
52 0 : __m128i zero = _mm_setzero_si128();
53 0 : do {
54 0 : uint32_t XX = *xy++; // x0:14 | 4 | x1:14
55 0 : unsigned x0 = XX >> 18;
56 0 : unsigned x1 = XX & 0x3FFF;
57 :
58 : // (0, 0, 0, 0, 0, 0, 0, x)
59 0 : __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
60 :
61 : // (0, 0, 0, 0, x, x, x, x)
62 0 : allX = _mm_shufflelo_epi16(allX, 0);
63 :
64 : // (x, x, x, x, x, x, x, x)
65 0 : allX = _mm_shuffle_epi32(allX, 0);
66 :
67 : // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
68 0 : __m128i negX = _mm_sub_epi16(sixteen, allX);
69 :
70 : // Load 4 samples (pixels).
71 0 : __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
72 0 : __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
73 0 : __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
74 0 : __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
75 :
76 : // (0, 0, a00, a10)
77 0 : __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
78 :
79 : // Expand to 16 bits per component.
80 0 : a00a10 = _mm_unpacklo_epi8(a00a10, zero);
81 :
82 : // ((a00 * (16-y)), (a10 * y)).
83 0 : a00a10 = _mm_mullo_epi16(a00a10, allY);
84 :
85 : // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
86 0 : a00a10 = _mm_mullo_epi16(a00a10, negX);
87 :
88 : // (0, 0, a01, a11)
89 0 : __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
90 :
91 : // Expand to 16 bits per component.
92 0 : a01a11 = _mm_unpacklo_epi8(a01a11, zero);
93 :
94 : // (a01 * (16-y)), (a11 * y)
95 0 : a01a11 = _mm_mullo_epi16(a01a11, allY);
96 :
97 : // (a01 * (16-y) * x), (a11 * y * x)
98 0 : a01a11 = _mm_mullo_epi16(a01a11, allX);
99 :
100 : // (a00*w00 + a01*w01, a10*w10 + a11*w11)
101 0 : __m128i sum = _mm_add_epi16(a00a10, a01a11);
102 :
103 : // (DC, a00*w00 + a01*w01)
// Shuffle control 0xEE selects 32-bit lanes (3, 2, 3, 2), i.e. it copies the
// upper 64 bits (the row0 terms) into the lower 64 bits.
104 0 : __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
105 :
106 : // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
107 0 : sum = _mm_add_epi16(sum, shifted);
108 :
109 : // Divide each 16 bit component by 256.
110 0 : sum = _mm_srli_epi16(sum, 8);
111 :
112 : // Pack lower 4 16 bit values of sum into lower 4 bytes.
113 0 : sum = _mm_packus_epi16(sum, zero);
114 :
115 : // Extract low int and store.
116 0 : *colors++ = _mm_cvtsi128_si32(sum);
117 : } while (--count > 0);
118 0 : }
119 :
// Bilinear-filtered sampling of 32-bit (8 bits per channel) pixels along one
// pair of source rows, using SSE2, followed by a uniform scale of every
// channel by s.fAlphaScale / 256. Requires s.fAlphaScale < 256 (asserted).
//
// Input layout, as decoded below:
//   xy[0]    : bits 18..31 = row0 index, bits 14..17 = y weight (0..15),
//              bits 0..13  = row1 index.
//   xy[1...] : one word per output pixel,
//              bits 18..31 = x0, bits 14..17 = x weight (0..15),
//              bits 0..13  = x1.
// One word is read before the loop and one per iteration, so count + 1 words
// of xy are consumed and count pixels are written to colors.
//
// With aRC = pixel at row R / column C, every 8-bit channel is computed as
//   blend  = (a00*(16-x)*(16-y) + a01*x*(16-y) + a10*(16-x)*y + a11*x*y) >> 8
//   result = (blend * s.fAlphaScale) >> 8
// The four weights always sum to 256 and each channel is at most 255, so the
// blend fits in the 16-bit lanes; blend <= 255 and the asserted
// fAlphaScale < 256 keep the second product within 16 bits as well.
//
// Lane listings in the comments below run from the most significant lane
// (left) to the least significant lane (right).
//
// The loop is a do/while, so its body runs at least once; callers must honor
// the count > 0 assertion.
120 0 : void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
121 : const uint32_t* xy,
122 : int count, uint32_t* colors) {
123 0 : SkASSERT(count > 0 && colors != NULL);
124 0 : SkASSERT(s.fDoFilter);
125 0 : SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
126 0 : SkASSERT(s.fAlphaScale < 256);
127 :
// Decode the packed Y word: y0 holds (row0 index << 4) | subY, and the low
// 14 bits of XY hold the row1 index. Row addresses are byte offsets
// (index * rowBytes) from the start of the pixel buffer.
128 0 : const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
129 0 : unsigned rb = s.fBitmap->rowBytes();
130 0 : uint32_t XY = *xy++;
131 0 : unsigned y0 = XY >> 14;
132 0 : const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
133 0 : const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
134 0 : unsigned subY = y0 & 0xF;
135 :
136 : // ( 0, 0, 0, 0, 0, 0, 0, 16)
137 0 : __m128i sixteen = _mm_cvtsi32_si128(16);
138 :
139 : // ( 0, 0, 0, 0, 16, 16, 16, 16)
140 0 : sixteen = _mm_shufflelo_epi16(sixteen, 0);
141 :
142 : // ( 0, 0, 0, 0, 0, 0, 0, y)
143 0 : __m128i allY = _mm_cvtsi32_si128(subY);
144 :
145 : // ( 0, 0, 0, 0, y, y, y, y)
146 0 : allY = _mm_shufflelo_epi16(allY, 0);
147 :
148 : // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
149 0 : __m128i negY = _mm_sub_epi16(sixteen, allY);
150 :
151 : // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
// The upper half weights the row0 pixel and the lower half the row1 pixel,
// matching the (row0, row1) packing of the pixel pairs built in the loop.
152 0 : allY = _mm_unpacklo_epi64(allY, negY);
153 :
154 : // (16, 16, 16, 16, 16, 16, 16, 16 )
155 0 : sixteen = _mm_shuffle_epi32(sixteen, 0);
156 :
157 : // ( 0, 0, 0, 0, 0, 0, 0, 0)
158 0 : __m128i zero = _mm_setzero_si128();
159 :
160 : // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
161 0 : __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
162 :
163 0 : do {
164 0 : uint32_t XX = *xy++; // x0:14 | 4 | x1:14
165 0 : unsigned x0 = XX >> 18;
166 0 : unsigned x1 = XX & 0x3FFF;
167 :
168 : // (0, 0, 0, 0, 0, 0, 0, x)
169 0 : __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
170 :
171 : // (0, 0, 0, 0, x, x, x, x)
172 0 : allX = _mm_shufflelo_epi16(allX, 0);
173 :
174 : // (x, x, x, x, x, x, x, x)
175 0 : allX = _mm_shuffle_epi32(allX, 0);
176 :
177 : // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
178 0 : __m128i negX = _mm_sub_epi16(sixteen, allX);
179 :
180 : // Load 4 samples (pixels).
181 0 : __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
182 0 : __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
183 0 : __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
184 0 : __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
185 :
186 : // (0, 0, a00, a10)
187 0 : __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
188 :
189 : // Expand to 16 bits per component.
190 0 : a00a10 = _mm_unpacklo_epi8(a00a10, zero);
191 :
192 : // ((a00 * (16-y)), (a10 * y)).
193 0 : a00a10 = _mm_mullo_epi16(a00a10, allY);
194 :
195 : // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
196 0 : a00a10 = _mm_mullo_epi16(a00a10, negX);
197 :
198 : // (0, 0, a01, a11)
199 0 : __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
200 :
201 : // Expand to 16 bits per component.
202 0 : a01a11 = _mm_unpacklo_epi8(a01a11, zero);
203 :
204 : // (a01 * (16-y)), (a11 * y)
205 0 : a01a11 = _mm_mullo_epi16(a01a11, allY);
206 :
207 : // (a01 * (16-y) * x), (a11 * y * x)
208 0 : a01a11 = _mm_mullo_epi16(a01a11, allX);
209 :
210 : // (a00*w00 + a01*w01, a10*w10 + a11*w11)
211 0 : __m128i sum = _mm_add_epi16(a00a10, a01a11);
212 :
213 : // (DC, a00*w00 + a01*w01)
// Shuffle control 0xEE selects 32-bit lanes (3, 2, 3, 2), i.e. it copies the
// upper 64 bits (the row0 terms) into the lower 64 bits.
214 0 : __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
215 :
216 : // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
217 0 : sum = _mm_add_epi16(sum, shifted);
218 :
219 : // Divide each 16 bit component by 256.
220 0 : sum = _mm_srli_epi16(sum, 8);
221 :
222 : // Multiply by alpha.
223 0 : sum = _mm_mullo_epi16(sum, alpha);
224 :
225 : // Divide each 16 bit component by 256.
226 0 : sum = _mm_srli_epi16(sum, 8);
227 :
228 : // Pack lower 4 16 bit values of sum into lower 4 bytes.
229 0 : sum = _mm_packus_epi16(sum, zero);
230 :
231 : // Extract low int and store.
232 0 : *colors++ = _mm_cvtsi128_si32(sum);
233 : } while (--count > 0);
234 0 : }
|