1 :
2 : /*
3 : * Copyright 2009 The Android Open Source Project
4 : *
5 : * Use of this source code is governed by a BSD-style license that can be
6 : * found in the LICENSE file.
7 : */
8 :
9 :
10 : #include "SkBlitRow_opts_SSE2.h"
11 : #include "SkColorPriv.h"
12 : #include "SkUtils.h"
13 :
14 : #include <emmintrin.h>
15 :
16 : /* SSE2 version of S32_Blend_BlitRow32()
17 : * portable version is in core/SkBlitRow_D32.cpp
18 : */
19 0 : void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 : const SkPMColor* SK_RESTRICT src,
21 : int count, U8CPU alpha) {
22 0 : SkASSERT(alpha <= 255);
23 0 : if (count <= 0) {
24 0 : return;
25 : }
26 :
27 0 : uint32_t src_scale = SkAlpha255To256(alpha);
28 0 : uint32_t dst_scale = 256 - src_scale;
29 :
30 0 : if (count >= 4) {
31 0 : SkASSERT(((size_t)dst & 0x03) == 0);
32 0 : while (((size_t)dst & 0x0F) != 0) {
33 0 : *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 0 : src++;
35 0 : dst++;
36 0 : count--;
37 : }
38 :
39 0 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
41 0 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 0 : __m128i src_scale_wide = _mm_set1_epi16(src_scale);
43 0 : __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
44 0 : while (count >= 4) {
45 : // Load 4 pixels each of src and dest.
46 0 : __m128i src_pixel = _mm_loadu_si128(s);
47 0 : __m128i dst_pixel = _mm_load_si128(d);
48 :
49 : // Get red and blue pixels into lower byte of each word.
50 0 : __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
51 0 : __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
52 :
53 : // Get alpha and green into lower byte of each word.
54 0 : __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
55 0 : __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
56 :
57 : // Multiply by scale.
58 0 : src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
59 0 : src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
60 0 : dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
61 0 : dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
62 :
63 : // Divide by 256.
64 0 : src_rb = _mm_srli_epi16(src_rb, 8);
65 0 : dst_rb = _mm_srli_epi16(dst_rb, 8);
66 0 : src_ag = _mm_andnot_si128(rb_mask, src_ag);
67 0 : dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
68 :
69 : // Combine back into RGBA.
70 0 : src_pixel = _mm_or_si128(src_rb, src_ag);
71 0 : dst_pixel = _mm_or_si128(dst_rb, dst_ag);
72 :
73 : // Add result
74 0 : __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
75 0 : _mm_store_si128(d, result);
76 0 : s++;
77 0 : d++;
78 0 : count -= 4;
79 : }
80 0 : src = reinterpret_cast<const SkPMColor*>(s);
81 0 : dst = reinterpret_cast<SkPMColor*>(d);
82 : }
83 :
84 0 : while (count > 0) {
85 0 : *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
86 0 : src++;
87 0 : dst++;
88 0 : count--;
89 : }
90 : }
91 :
92 0 : void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
93 : const SkPMColor* SK_RESTRICT src,
94 : int count, U8CPU alpha) {
95 0 : SkASSERT(alpha == 255);
96 0 : if (count <= 0) {
97 0 : return;
98 : }
99 :
100 0 : if (count >= 4) {
101 0 : SkASSERT(((size_t)dst & 0x03) == 0);
102 0 : while (((size_t)dst & 0x0F) != 0) {
103 0 : *dst = SkPMSrcOver(*src, *dst);
104 0 : src++;
105 0 : dst++;
106 0 : count--;
107 : }
108 :
109 0 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
110 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
111 : #ifdef SK_USE_ACCURATE_BLENDING
112 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
113 : __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
114 : __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
115 : while (count >= 4) {
116 : // Load 4 pixels
117 : __m128i src_pixel = _mm_loadu_si128(s);
118 : __m128i dst_pixel = _mm_load_si128(d);
119 :
120 : __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
121 : __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
122 : // Shift alphas down to lower 8 bits of each quad.
123 : __m128i alpha = _mm_srli_epi32(src_pixel, 24);
124 :
125 : // Copy alpha to upper 3rd byte of each quad
126 : alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
127 :
128 : // Subtract alphas from 255, to get 0..255
129 : alpha = _mm_sub_epi16(c_255, alpha);
130 :
131 : // Multiply by red and blue by src alpha.
132 : dst_rb = _mm_mullo_epi16(dst_rb, alpha);
133 : // Multiply by alpha and green by src alpha.
134 : dst_ag = _mm_mullo_epi16(dst_ag, alpha);
135 :
136 : // dst_rb_low = (dst_rb >> 8)
137 : __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
138 : __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
139 :
140 : // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
141 : dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
142 : dst_rb = _mm_add_epi16(dst_rb, c_128);
143 : dst_rb = _mm_srli_epi16(dst_rb, 8);
144 :
145 : // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
146 : dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
147 : dst_ag = _mm_add_epi16(dst_ag, c_128);
148 : dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
149 :
150 : // Combine back into RGBA.
151 : dst_pixel = _mm_or_si128(dst_rb, dst_ag);
152 :
153 : // Add result
154 : __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
155 : _mm_store_si128(d, result);
156 : s++;
157 : d++;
158 : count -= 4;
159 : }
160 : #else
161 0 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
162 0 : __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
163 0 : while (count >= 4) {
164 : // Load 4 pixels
165 0 : __m128i src_pixel = _mm_loadu_si128(s);
166 0 : __m128i dst_pixel = _mm_load_si128(d);
167 :
168 0 : __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
169 0 : __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
170 :
171 : // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
172 0 : __m128i alpha = _mm_srli_epi16(src_pixel, 8);
173 :
174 : // (a0, a0, a1, a1, a2, g2, a3, g3)
175 0 : alpha = _mm_shufflehi_epi16(alpha, 0xF5);
176 :
177 : // (a0, a0, a1, a1, a2, a2, a3, a3)
178 0 : alpha = _mm_shufflelo_epi16(alpha, 0xF5);
179 :
180 : // Subtract alphas from 256, to get 1..256
181 0 : alpha = _mm_sub_epi16(c_256, alpha);
182 :
183 : // Multiply by red and blue by src alpha.
184 0 : dst_rb = _mm_mullo_epi16(dst_rb, alpha);
185 : // Multiply by alpha and green by src alpha.
186 0 : dst_ag = _mm_mullo_epi16(dst_ag, alpha);
187 :
188 : // Divide by 256.
189 0 : dst_rb = _mm_srli_epi16(dst_rb, 8);
190 :
191 : // Mask out high bits (already in the right place)
192 0 : dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
193 :
194 : // Combine back into RGBA.
195 0 : dst_pixel = _mm_or_si128(dst_rb, dst_ag);
196 :
197 : // Add result
198 0 : __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
199 0 : _mm_store_si128(d, result);
200 0 : s++;
201 0 : d++;
202 0 : count -= 4;
203 : }
204 : #endif
205 0 : src = reinterpret_cast<const SkPMColor*>(s);
206 0 : dst = reinterpret_cast<SkPMColor*>(d);
207 : }
208 :
209 0 : while (count > 0) {
210 0 : *dst = SkPMSrcOver(*src, *dst);
211 0 : src++;
212 0 : dst++;
213 0 : count--;
214 : }
215 : }
216 :
217 0 : void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
218 : const SkPMColor* SK_RESTRICT src,
219 : int count, U8CPU alpha) {
220 0 : SkASSERT(alpha <= 255);
221 0 : if (count <= 0) {
222 0 : return;
223 : }
224 :
225 0 : if (count >= 4) {
226 0 : while (((size_t)dst & 0x0F) != 0) {
227 0 : *dst = SkBlendARGB32(*src, *dst, alpha);
228 0 : src++;
229 0 : dst++;
230 0 : count--;
231 : }
232 :
233 0 : uint32_t src_scale = SkAlpha255To256(alpha);
234 :
235 0 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
236 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
237 0 : __m128i src_scale_wide = _mm_set1_epi16(src_scale);
238 0 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
239 0 : __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
240 0 : while (count >= 4) {
241 : // Load 4 pixels each of src and dest.
242 0 : __m128i src_pixel = _mm_loadu_si128(s);
243 0 : __m128i dst_pixel = _mm_load_si128(d);
244 :
245 : // Get red and blue pixels into lower byte of each word.
246 0 : __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
247 0 : __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
248 :
249 : // Get alpha and green into lower byte of each word.
250 0 : __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
251 0 : __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
252 :
253 : // Put per-pixel alpha in low byte of each word.
254 0 : __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
255 0 : dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
256 :
257 : // dst_alpha = dst_alpha * src_scale
258 0 : dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
259 :
260 : // Divide by 256.
261 0 : dst_alpha = _mm_srli_epi16(dst_alpha, 8);
262 :
263 : // Subtract alphas from 256, to get 1..256
264 0 : dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
265 :
266 : // Multiply red and blue by dst pixel alpha.
267 0 : dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
268 : // Multiply alpha and green by dst pixel alpha.
269 0 : dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
270 :
271 : // Multiply red and blue by global alpha.
272 0 : src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
273 : // Multiply alpha and green by global alpha.
274 0 : src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
275 :
276 : // Divide by 256.
277 0 : dst_rb = _mm_srli_epi16(dst_rb, 8);
278 0 : src_rb = _mm_srli_epi16(src_rb, 8);
279 :
280 : // Mask out low bits (goodies already in the right place; no need to divide)
281 0 : dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
282 0 : src_ag = _mm_andnot_si128(rb_mask, src_ag);
283 :
284 : // Combine back into RGBA.
285 0 : dst_pixel = _mm_or_si128(dst_rb, dst_ag);
286 0 : src_pixel = _mm_or_si128(src_rb, src_ag);
287 :
288 : // Add two pixels into result.
289 0 : __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
290 0 : _mm_store_si128(d, result);
291 0 : s++;
292 0 : d++;
293 0 : count -= 4;
294 : }
295 0 : src = reinterpret_cast<const SkPMColor*>(s);
296 0 : dst = reinterpret_cast<SkPMColor*>(d);
297 : }
298 :
299 0 : while (count > 0) {
300 0 : *dst = SkBlendARGB32(*src, *dst, alpha);
301 0 : src++;
302 0 : dst++;
303 0 : count--;
304 : }
305 : }
306 :
307 : /* SSE2 version of Color32()
308 : * portable version is in core/SkBlitRow_D32.cpp
309 : */
310 0 : void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
311 : SkPMColor color) {
312 :
313 0 : if (count <= 0) {
314 0 : return;
315 : }
316 :
317 0 : if (0 == color) {
318 0 : if (src != dst) {
319 0 : memcpy(dst, src, count * sizeof(SkPMColor));
320 : }
321 0 : return;
322 : }
323 :
324 0 : unsigned colorA = SkGetPackedA32(color);
325 0 : if (255 == colorA) {
326 0 : sk_memset32(dst, color, count);
327 : } else {
328 0 : unsigned scale = 256 - SkAlpha255To256(colorA);
329 :
330 0 : if (count >= 4) {
331 0 : SkASSERT(((size_t)dst & 0x03) == 0);
332 0 : while (((size_t)dst & 0x0F) != 0) {
333 0 : *dst = color + SkAlphaMulQ(*src, scale);
334 0 : src++;
335 0 : dst++;
336 0 : count--;
337 : }
338 :
339 0 : const __m128i *s = reinterpret_cast<const __m128i*>(src);
340 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
341 0 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
342 0 : __m128i src_scale_wide = _mm_set1_epi16(scale);
343 0 : __m128i color_wide = _mm_set1_epi32(color);
344 0 : while (count >= 4) {
345 : // Load 4 pixels each of src and dest.
346 0 : __m128i src_pixel = _mm_loadu_si128(s);
347 :
348 : // Get red and blue pixels into lower byte of each word.
349 0 : __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
350 :
351 : // Get alpha and green into lower byte of each word.
352 0 : __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
353 :
354 : // Multiply by scale.
355 0 : src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
356 0 : src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
357 :
358 : // Divide by 256.
359 0 : src_rb = _mm_srli_epi16(src_rb, 8);
360 0 : src_ag = _mm_andnot_si128(rb_mask, src_ag);
361 :
362 : // Combine back into RGBA.
363 0 : src_pixel = _mm_or_si128(src_rb, src_ag);
364 :
365 : // Add color to result.
366 0 : __m128i result = _mm_add_epi8(color_wide, src_pixel);
367 :
368 : // Store result.
369 0 : _mm_store_si128(d, result);
370 0 : s++;
371 0 : d++;
372 0 : count -= 4;
373 : }
374 0 : src = reinterpret_cast<const SkPMColor*>(s);
375 0 : dst = reinterpret_cast<SkPMColor*>(d);
376 : }
377 :
378 0 : while (count > 0) {
379 0 : *dst = color + SkAlphaMulQ(*src, scale);
380 0 : src += 1;
381 0 : dst += 1;
382 0 : count--;
383 : }
384 : }
385 : }
386 :
387 0 : void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
388 : size_t maskRB, SkColor origColor,
389 : int width, int height)
390 : {
391 0 : SkPMColor color = SkPreMultiplyColor(origColor);
392 0 : size_t dstOffset = dstRB - (width << 2);
393 0 : size_t maskOffset = maskRB - width;
394 0 : SkPMColor* dst = (SkPMColor *)device;
395 0 : const uint8_t* mask = (const uint8_t*)maskPtr;
396 0 : do {
397 0 : int count = width;
398 0 : if (count >= 4) {
399 0 : while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
400 0 : *dst = SkBlendARGB32(color, *dst, *mask);
401 0 : mask++;
402 0 : dst++;
403 0 : count--;
404 : }
405 0 : __m128i *d = reinterpret_cast<__m128i*>(dst);
406 0 : __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
407 0 : __m128i c_256 = _mm_set1_epi16(256);
408 0 : __m128i c_1 = _mm_set1_epi16(1);
409 0 : __m128i src_pixel = _mm_set1_epi32(color);
410 0 : while (count >= 4) {
411 : // Load 4 pixels each of src and dest.
412 0 : __m128i dst_pixel = _mm_load_si128(d);
413 :
414 : //set the aphla value
415 0 : __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
416 0 : 0, *(mask+3),0, \
417 0 : *(mask+2),0, *(mask+2),\
418 0 : 0,*(mask+1), 0,*(mask+1),\
419 0 : 0, *mask,0,*mask);
420 :
421 : //call SkAlpha255To256()
422 0 : src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
423 :
424 : // Get red and blue pixels into lower byte of each word.
425 0 : __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
426 0 : __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
427 :
428 : // Get alpha and green into lower byte of each word.
429 0 : __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
430 0 : __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
431 :
432 : // Put per-pixel alpha in low byte of each word.
433 0 : __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
434 0 : dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
435 :
436 : // dst_alpha = dst_alpha * src_scale
437 0 : dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
438 :
439 : // Divide by 256.
440 0 : dst_alpha = _mm_srli_epi16(dst_alpha, 8);
441 :
442 : // Subtract alphas from 256, to get 1..256
443 0 : dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
444 : // Multiply red and blue by dst pixel alpha.
445 0 : dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
446 : // Multiply alpha and green by dst pixel alpha.
447 0 : dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
448 :
449 : // Multiply red and blue by global alpha.
450 0 : src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
451 : // Multiply alpha and green by global alpha.
452 0 : src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
453 : // Divide by 256.
454 0 : dst_rb = _mm_srli_epi16(dst_rb, 8);
455 0 : src_rb = _mm_srli_epi16(src_rb, 8);
456 :
457 : // Mask out low bits (goodies already in the right place; no need to divide)
458 0 : dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
459 0 : src_ag = _mm_andnot_si128(rb_mask, src_ag);
460 :
461 : // Combine back into RGBA.
462 0 : dst_pixel = _mm_or_si128(dst_rb, dst_ag);
463 0 : __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
464 :
465 : // Add two pixels into result.
466 0 : __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
467 0 : _mm_store_si128(d, result);
468 : // load the next 4 pixel
469 0 : mask = mask + 4;
470 0 : d++;
471 0 : count -= 4;
472 : }
473 0 : dst = reinterpret_cast<SkPMColor *>(d);
474 : }
475 0 : while(count > 0) {
476 0 : *dst= SkBlendARGB32(color, *dst, *mask);
477 0 : dst += 1;
478 0 : mask++;
479 0 : count --;
480 : }
481 0 : dst = (SkPMColor *)((char*)dst + dstOffset);
482 0 : mask += maskOffset;
483 : } while (--height != 0);
484 0 : }
|