/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

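/* Global SSE2 constants. The names encode the value replicated across the
 * vector (e.g. mask_0080 holds 0x0080 in each 16-bit lane, mask_00ff holds
 * 0x00ff, and so on). They are initialized once, before any of the
 * combiners below can run; the initialization code is not part of this
 * excerpt.
 */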
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

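/* Pixels are processed in "unpacked" form throughout this file: each 8-bit
 * channel is widened to a 16-bit lane, so a single __m128i holds two
 * a8r8g8b8 pixels and a (lo, hi) pair holds four. unpack_32_1x128 widens
 * one pixel; pack_1x128_32 below is its inverse.
 */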
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

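/* Expand r5g6b5 to 8888: each field is shifted into its byte, then the
 * field's top bits are replicated into the freed low bits (top 3 bits for
 * red/blue, top 2 for green), so that e.g. 0x1f expands to a full 0xff
 * rather than 0xf8.
 */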
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

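/* _mm_movemask_epi8 gathers the top bit of each byte, so bits 3, 7, 11 and
 * 15 of the mask correspond to the alpha bytes of the four a8r8g8b8
 * pixels, hence the 0x8888 masks: is_opaque tests that all four alphas are
 * 0xff, is_transparent that they are all 0x00, and is_zero that the whole
 * vector is zero.
 */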
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

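/* Per-channel multiply of two unpacked operands, with an exact rounded
 * division by 255. The scalar equivalent for each 16-bit lane is
 *
 *     t = x * a + 0x80;
 *     result = (t * 0x0101) >> 16;
 *
 * which equals x * a / 255 rounded to nearest for x, a in [0, 255];
 * the _mm_mulhi_epu16 against mask_0101 performs the final step.
 */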
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

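/* Porter-Duff OVER on premultiplied pixels:
 *
 *     dst = src + (1 - alpha (src)) * dst
 *
 * The alpha arguments must already hold alpha (src) replicated into every
 * channel (see expand_alpha_2x128); the saturating add keeps rounding
 * errors from wrapping.
 */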
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

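/* combine1/combine4 fetch one (resp. four) source pixels and apply the
 * optional mask: when pm is non-NULL, the source is multiplied by the
 * mask's expanded alpha channel. combine4 additionally short-circuits to
 * zero when all four mask alphas are zero.
 */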
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

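/* The combiners below all share the same loop structure: a scalar head
 * runs until pd reaches a 16-byte boundary, a SIMD body then handles four
 * pixels per iteration with aligned destination stores, and a scalar tail
 * finishes the remainder. The masked OVER body also skips work for fully
 * transparent spans (is_zero) and stores opaque source spans directly
 * (is_opaque).
 */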
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

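/* OVER_REVERSE composites the destination over the source; note the
 * swapped argument order in the calls to core_combine_over_u_pixel_sse2
 * and over_2x128, with the result still written back to the destination.
 */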
static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

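/* IN helper: core_combine_in_u_pixel_sse2 (x, y) returns y scaled by
 * alpha (x). sse2_combine_in_u passes (d, s) to compute src IN dst, and
 * sse2_combine_in_reverse_u passes (s, d) for the reverse operator.
 */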
static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

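/* ATOP: result = src * alpha (dst) + dst * (1 - alpha (src)), computed
 * with a single pix_add_multiply; the reverse variant below negates the
 * destination alpha instead of the source alpha.
 */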
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

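/* XOR: result = src * (1 - alpha (dst)) + dst * (1 - alpha (src)). */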
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

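/* ADD is a saturating per-channel add; the scalar head and tail use
 * _mm_adds_epu8 on single pixels moved through the low lane of an XMM
 * register, and combine1/combine4 pre-apply the optional mask.
 */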
static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

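/* SATURATE adds as much of the source as the destination can absorb: when
 * alpha (src) exceeds ~alpha (dst), the source is first scaled by the
 * ratio ~alpha (dst) / alpha (src). The SIMD loop tests all four pixels at
 * once and falls back to the scalar path whenever any of them needs
 * scaling.
 */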
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if any source alpha is greater than the respective ~alpha of dst */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

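/* The _ca ("component alpha") combiners below take a per-channel mask:
 * each channel of the mask scales the corresponding source channel
 * independently, so the mask is applied with a full pix_multiply rather
 * than an alpha expansion, and pm is never NULL on these paths.
 */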
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

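/* Component-alpha ATOP:
 *
 *     dst = src * mask * alpha (dst) + dst * (1 - mask * alpha (src))
 */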
1992 : static force_inline uint32_t
1993 : core_combine_atop_ca_pixel_sse2 (uint32_t src,
1994 : uint32_t mask,
1995 : uint32_t dst)
1996 : {
1997 0 : __m128i m = unpack_32_1x128 (mask);
1998 0 : __m128i s = unpack_32_1x128 (src);
1999 0 : __m128i d = unpack_32_1x128 (dst);
2000 0 : __m128i sa = expand_alpha_1x128 (s);
2001 0 : __m128i da = expand_alpha_1x128 (d);
2002 :
2003 0 : s = pix_multiply_1x128 (s, m);
2004 0 : m = negate_1x128 (pix_multiply_1x128 (m, sa));
2005 :
2006 : return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2007 : }
2008 :
2009 : static void
2010 0 : sse2_combine_atop_ca (pixman_implementation_t *imp,
2011 : pixman_op_t op,
2012 : uint32_t * pd,
2013 : const uint32_t * ps,
2014 : const uint32_t * pm,
2015 : int w)
2016 : {
2017 : uint32_t s, m, d;
2018 :
2019 : __m128i xmm_src_lo, xmm_src_hi;
2020 : __m128i xmm_dst_lo, xmm_dst_hi;
2021 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2022 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2023 : __m128i xmm_mask_lo, xmm_mask_hi;
2024 :
2025 0 : while (w && (unsigned long)pd & 15)
2026 : {
2027 0 : s = *ps++;
2028 0 : m = *pm++;
2029 0 : d = *pd;
2030 :
2031 0 : *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2032 0 : w--;
2033 : }
2034 :
2035 0 : while (w >= 4)
2036 : {
2037 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2038 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2039 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2040 :
2041 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2042 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2043 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2044 :
2045 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2046 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2047 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2048 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2049 :
2050 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2051 : &xmm_mask_lo, &xmm_mask_hi,
2052 : &xmm_src_lo, &xmm_src_hi);
2053 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2054 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2055 : &xmm_mask_lo, &xmm_mask_hi);
2056 :
2057 0 : negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2058 :
2059 : pix_add_multiply_2x128 (
2060 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2061 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2062 : &xmm_dst_lo, &xmm_dst_hi);
2063 :
2064 0 : save_128_aligned (
2065 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2066 :
2067 0 : ps += 4;
2068 0 : pd += 4;
2069 0 : pm += 4;
2070 0 : w -= 4;
2071 : }
2072 :
2073 0 : while (w)
2074 : {
2075 0 : s = *ps++;
2076 0 : m = *pm++;
2077 0 : d = *pd;
2078 :
2079 0 : *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2080 0 : w--;
2081 : }
2082 0 : }
2083 :
2084 : static force_inline uint32_t
2085 : core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2086 : uint32_t mask,
2087 : uint32_t dst)
2088 : {
2089 0 : __m128i m = unpack_32_1x128 (mask);
2090 0 : __m128i s = unpack_32_1x128 (src);
2091 0 : __m128i d = unpack_32_1x128 (dst);
2092 :
2093 0 : __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2094 0 : __m128i sa = expand_alpha_1x128 (s);
2095 :
2096 0 : s = pix_multiply_1x128 (s, m);
2097 0 : m = pix_multiply_1x128 (m, sa);
2098 :
2099 : return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2100 : }
2101 :
2102 : static void
2103 0 : sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2104 : pixman_op_t op,
2105 : uint32_t * pd,
2106 : const uint32_t * ps,
2107 : const uint32_t * pm,
2108 : int w)
2109 : {
2110 : uint32_t s, m, d;
2111 :
2112 : __m128i xmm_src_lo, xmm_src_hi;
2113 : __m128i xmm_dst_lo, xmm_dst_hi;
2114 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2115 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2116 : __m128i xmm_mask_lo, xmm_mask_hi;
2117 :
2118 0 : while (w && (unsigned long)pd & 15)
2119 : {
2120 0 : s = *ps++;
2121 0 : m = *pm++;
2122 0 : d = *pd;
2123 :
2124 0 : *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2125 0 : w--;
2126 : }
2127 :
2128 0 : while (w >= 4)
2129 : {
2130 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2131 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2132 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2133 :
2134 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2135 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2136 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2137 :
2138 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2139 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2140 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2141 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2142 :
2143 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2144 : &xmm_mask_lo, &xmm_mask_hi,
2145 : &xmm_src_lo, &xmm_src_hi);
2146 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2147 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2148 : &xmm_mask_lo, &xmm_mask_hi);
2149 :
2150 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2151 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2152 :
2153 : pix_add_multiply_2x128 (
2154 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2155 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2156 : &xmm_dst_lo, &xmm_dst_hi);
2157 :
2158 0 : save_128_aligned (
2159 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2160 :
2161 0 : ps += 4;
2162 0 : pd += 4;
2163 0 : pm += 4;
2164 0 : w -= 4;
2165 : }
2166 :
2167 0 : while (w)
2168 : {
2169 0 : s = *ps++;
2170 0 : m = *pm++;
2171 0 : d = *pd;
2172 :
2173 0 : *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2174 0 : w--;
2175 : }
2176 0 : }
2177 :
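 : /* Component-alpha XOR.  Per channel:
 :  *
 :  *   dest = src * mask * (1 - dest_alpha) + dest * (1 - mask * src_alpha)
 :  */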
2178 : static force_inline uint32_t
2179 : core_combine_xor_ca_pixel_sse2 (uint32_t src,
2180 : uint32_t mask,
2181 : uint32_t dst)
2182 : {
2183 0 : __m128i a = unpack_32_1x128 (mask);
2184 0 : __m128i s = unpack_32_1x128 (src);
2185 0 : __m128i d = unpack_32_1x128 (dst);
2186 :
2187 0 : __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2188 : a, expand_alpha_1x128 (s)));
2189 0 : __m128i dest = pix_multiply_1x128 (s, a);
2190 0 : __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2191 :
2192 : return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2193 : &alpha_dst,
2194 : &dest,
2195 : &alpha_src));
2196 : }
2197 :
2198 : static void
2199 0 : sse2_combine_xor_ca (pixman_implementation_t *imp,
2200 : pixman_op_t op,
2201 : uint32_t * pd,
2202 : const uint32_t * ps,
2203 : const uint32_t * pm,
2204 : int w)
2205 : {
2206 : uint32_t s, m, d;
2207 :
2208 : __m128i xmm_src_lo, xmm_src_hi;
2209 : __m128i xmm_dst_lo, xmm_dst_hi;
2210 : __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2211 : __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2212 : __m128i xmm_mask_lo, xmm_mask_hi;
2213 :
2214 0 : while (w && (unsigned long)pd & 15)
2215 : {
2216 0 : s = *ps++;
2217 0 : m = *pm++;
2218 0 : d = *pd;
2219 :
2220 0 : *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2221 0 : w--;
2222 : }
2223 :
2224 0 : while (w >= 4)
2225 : {
2226 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2227 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2228 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2229 :
2230 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2231 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2232 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2233 :
2234 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2235 : &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2236 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2237 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2238 :
2239 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2240 : &xmm_mask_lo, &xmm_mask_hi,
2241 : &xmm_src_lo, &xmm_src_hi);
2242 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2243 : &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2244 : &xmm_mask_lo, &xmm_mask_hi);
2245 :
2246 0 : negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2247 : &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2248 0 : negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2249 : &xmm_mask_lo, &xmm_mask_hi);
2250 :
2251 : pix_add_multiply_2x128 (
2252 : &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2253 : &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2254 : &xmm_dst_lo, &xmm_dst_hi);
2255 :
2256 0 : save_128_aligned (
2257 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2258 :
2259 0 : ps += 4;
2260 0 : pd += 4;
2261 0 : pm += 4;
2262 0 : w -= 4;
2263 : }
2264 :
2265 0 : while (w)
2266 : {
2267 0 : s = *ps++;
2268 0 : m = *pm++;
2269 0 : d = *pd;
2270 :
2271 0 : *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2272 0 : w--;
2273 : }
2274 0 : }
2275 :
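 : /* Component-alpha ADD: dest = clamp (src * mask + dest).  The final
 :  * add uses the saturating byte add _mm_adds_epu8, so results clamp
 :  * at 0xff instead of wrapping.
 :  */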
2276 : static void
2277 0 : sse2_combine_add_ca (pixman_implementation_t *imp,
2278 : pixman_op_t op,
2279 : uint32_t * pd,
2280 : const uint32_t * ps,
2281 : const uint32_t * pm,
2282 : int w)
2283 : {
2284 : uint32_t s, m, d;
2285 :
2286 : __m128i xmm_src_lo, xmm_src_hi;
2287 : __m128i xmm_dst_lo, xmm_dst_hi;
2288 : __m128i xmm_mask_lo, xmm_mask_hi;
2289 :
2290 0 : while (w && (unsigned long)pd & 15)
2291 : {
2292 0 : s = *ps++;
2293 0 : m = *pm++;
2294 0 : d = *pd;
2295 :
2296 0 : *pd++ = pack_1x128_32 (
2297 : _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2298 : unpack_32_1x128 (m)),
2299 : unpack_32_1x128 (d)));
2300 0 : w--;
2301 : }
2302 :
2303 0 : while (w >= 4)
2304 : {
2305 0 : xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2306 0 : xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2307 0 : xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2308 :
2309 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2310 0 : unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2311 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2312 :
2313 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2314 : &xmm_mask_lo, &xmm_mask_hi,
2315 : &xmm_src_lo, &xmm_src_hi);
2316 :
2317 0 : save_128_aligned (
2318 : (__m128i*)pd, pack_2x128_128 (
2319 : _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2320 : _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2321 :
2322 0 : ps += 4;
2323 0 : pd += 4;
2324 0 : pm += 4;
2325 0 : w -= 4;
2326 : }
2327 :
2328 0 : while (w)
2329 : {
2330 0 : s = *ps++;
2331 0 : m = *pm++;
2332 0 : d = *pd;
2333 :
2334 0 : *pd++ = pack_1x128_32 (
2335 : _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2336 : unpack_32_1x128 (m)),
2337 : unpack_32_1x128 (d)));
2338 0 : w--;
2339 : }
2340 0 : }
2341 :
2342 : static force_inline __m128i
2343 : create_mask_16_128 (uint16_t mask)
2344 : {
2345 16 : return _mm_set1_epi16 (mask);
2346 : }
2347 :
2348 : /* Work around a code generation bug in Sun Studio 12. */
2349 : #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2350 : # define create_mask_2x32_128(mask0, mask1) \
2351 : (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2352 : #else
2353 : static force_inline __m128i
2354 : create_mask_2x32_128 (uint32_t mask0,
2355 : uint32_t mask1)
2356 : {
2357 61 : return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2358 : }
2359 : #endif
2360 :
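 : /* Fast path: solid source OVER a8r8g8b8.  Per pixel:
 :  *
 :  *   dest = src + dest * (1 - src_alpha)
 :  *
 :  * The unpacked source and its expanded alpha are loop invariants
 :  * and are computed once up front.
 :  */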
2361 : static void
2362 0 : sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2363 : pixman_op_t op,
2364 : pixman_image_t * src_image,
2365 : pixman_image_t * mask_image,
2366 : pixman_image_t * dst_image,
2367 : int32_t src_x,
2368 : int32_t src_y,
2369 : int32_t mask_x,
2370 : int32_t mask_y,
2371 : int32_t dest_x,
2372 : int32_t dest_y,
2373 : int32_t width,
2374 : int32_t height)
2375 : {
2376 : uint32_t src;
2377 : uint32_t *dst_line, *dst, d;
2378 : int32_t w;
2379 : int dst_stride;
2380 : __m128i xmm_src, xmm_alpha;
2381 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2382 :
2383 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2384 :
2385 0 : if (src == 0)
2386 0 : return;
2387 :
2388 0 : PIXMAN_IMAGE_GET_LINE (
2389 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2390 :
2391 0 : xmm_src = expand_pixel_32_1x128 (src);
2392 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2393 :
2394 0 : while (height--)
2395 : {
2396 0 : dst = dst_line;
2397 :
2398 0 : dst_line += dst_stride;
2399 0 : w = width;
2400 :
2401 0 : while (w && (unsigned long)dst & 15)
2402 : {
2403 0 : d = *dst;
2404 0 : *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2405 : xmm_alpha,
2406 : unpack_32_1x128 (d)));
2407 0 : w--;
2408 : }
2409 :
2410 0 : while (w >= 4)
2411 : {
2412 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2413 :
2414 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2415 :
2416 : over_2x128 (&xmm_src, &xmm_src,
2417 : &xmm_alpha, &xmm_alpha,
2418 : &xmm_dst_lo, &xmm_dst_hi);
2419 :
2420 :             /* rebuild the 4 pixel data and save */
2421 0 : save_128_aligned (
2422 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2423 :
2424 0 : w -= 4;
2425 0 : dst += 4;
2426 : }
2427 :
2428 0 : while (w)
2429 : {
2430 0 : d = *dst;
2431 0 : *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2432 : xmm_alpha,
2433 : unpack_32_1x128 (d)));
2434 0 : w--;
2435 : }
2436 :
2437 : }
2438 : }
2439 :
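 : /* Fast path: solid source OVER r5g6b5.  The SIMD body loads eight
 :  * 565 pixels at a time, expands them into four registers of two
 :  * unpacked 8888 pixels each, blends with over_2x128 (), and
 :  * repacks to 565.
 :  */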
2440 : static void
2441 0 : sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2442 : pixman_op_t op,
2443 : pixman_image_t * src_image,
2444 : pixman_image_t * mask_image,
2445 : pixman_image_t * dst_image,
2446 : int32_t src_x,
2447 : int32_t src_y,
2448 : int32_t mask_x,
2449 : int32_t mask_y,
2450 : int32_t dest_x,
2451 : int32_t dest_y,
2452 : int32_t width,
2453 : int32_t height)
2454 : {
2455 : uint32_t src;
2456 : uint16_t *dst_line, *dst, d;
2457 : int32_t w;
2458 : int dst_stride;
2459 : __m128i xmm_src, xmm_alpha;
2460 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2461 :
2462 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2463 :
2464 0 : if (src == 0)
2465 0 : return;
2466 :
2467 0 : PIXMAN_IMAGE_GET_LINE (
2468 : dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2469 :
2470 0 : xmm_src = expand_pixel_32_1x128 (src);
2471 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2472 :
2473 0 : while (height--)
2474 : {
2475 0 : dst = dst_line;
2476 :
2477 0 : dst_line += dst_stride;
2478 0 : w = width;
2479 :
2480 0 : while (w && (unsigned long)dst & 15)
2481 : {
2482 0 : d = *dst;
2483 :
2484 0 : *dst++ = pack_565_32_16 (
2485 : pack_1x128_32 (over_1x128 (xmm_src,
2486 : xmm_alpha,
2487 : expand565_16_1x128 (d))));
2488 0 : w--;
2489 : }
2490 :
2491 0 : while (w >= 8)
2492 : {
2493 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2494 :
2495 0 : unpack_565_128_4x128 (xmm_dst,
2496 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2497 :
2498 : over_2x128 (&xmm_src, &xmm_src,
2499 : &xmm_alpha, &xmm_alpha,
2500 : &xmm_dst0, &xmm_dst1);
2501 : over_2x128 (&xmm_src, &xmm_src,
2502 : &xmm_alpha, &xmm_alpha,
2503 : &xmm_dst2, &xmm_dst3);
2504 :
2505 0 : xmm_dst = pack_565_4x128_128 (
2506 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 :
2508 0 : save_128_aligned ((__m128i*)dst, xmm_dst);
2509 :
2510 0 : dst += 8;
2511 0 : w -= 8;
2512 : }
2513 :
2514 0 : while (w--)
2515 : {
2516 0 : d = *dst;
2517 0 : *dst++ = pack_565_32_16 (
2518 : pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2519 : expand565_16_1x128 (d))));
2520 : }
2521 : }
2522 :
2523 : }
2524 :
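 : /* Fast path: solid source, component-alpha mask, ADD operator:
 :  *
 :  *   dest = clamp (src * mask + dest)
 :  *
 :  * _mm_cmpeq_epi32/_mm_movemask_epi8 detect quads whose mask is all
 :  * zero so they can be skipped without touching the destination.
 :  */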
2525 : static void
2526 0 : sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2527 : pixman_op_t op,
2528 : pixman_image_t * src_image,
2529 : pixman_image_t * mask_image,
2530 : pixman_image_t * dst_image,
2531 : int32_t src_x,
2532 : int32_t src_y,
2533 : int32_t mask_x,
2534 : int32_t mask_y,
2535 : int32_t dest_x,
2536 : int32_t dest_y,
2537 : int32_t width,
2538 : int32_t height)
2539 : {
2540 : uint32_t src, srca;
2541 : uint32_t *dst_line, d;
2542 : uint32_t *mask_line, m;
2543 : uint32_t pack_cmp;
2544 : int dst_stride, mask_stride;
2545 :
2546 : __m128i xmm_src, xmm_alpha;
2547 : __m128i xmm_dst;
2548 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 :
2550 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2551 :
2552 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2553 0 : srca = src >> 24;
2554 :
2555 0 : if (src == 0)
2556 0 : return;
2557 :
2558 0 : PIXMAN_IMAGE_GET_LINE (
2559 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2560 0 : PIXMAN_IMAGE_GET_LINE (
2561 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2562 :
2563 0 : xmm_src = _mm_unpacklo_epi8 (
2564 : create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2565 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2566 0 : mmx_src = xmm_src;
2567 0 : mmx_alpha = xmm_alpha;
2568 :
2569 0 : while (height--)
2570 : {
2571 0 : int w = width;
2572 0 : const uint32_t *pm = (uint32_t *)mask_line;
2573 0 : uint32_t *pd = (uint32_t *)dst_line;
2574 :
2575 0 : dst_line += dst_stride;
2576 0 : mask_line += mask_stride;
2577 :
2578 0 : while (w && (unsigned long)pd & 15)
2579 : {
2580 0 : m = *pm++;
2581 :
2582 0 : if (m)
2583 : {
2584 0 : d = *pd;
2585 :
2586 0 : mmx_mask = unpack_32_1x128 (m);
2587 0 : mmx_dest = unpack_32_1x128 (d);
2588 :
2589 0 : *pd = pack_1x128_32 (
2590 : _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2591 : mmx_dest));
2592 : }
2593 :
2594 0 : pd++;
2595 0 : w--;
2596 : }
2597 :
2598 0 : while (w >= 4)
2599 : {
2600 0 : xmm_mask = load_128_unaligned ((__m128i*)pm);
2601 :
2602 0 : pack_cmp =
2603 0 : _mm_movemask_epi8 (
2604 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2605 :
2606 :             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2607 0 : if (pack_cmp != 0xffff)
2608 : {
2609 0 : xmm_dst = load_128_aligned ((__m128i*)pd);
2610 :
2611 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2612 :
2613 : pix_multiply_2x128 (&xmm_src, &xmm_src,
2614 : &xmm_mask_lo, &xmm_mask_hi,
2615 : &xmm_mask_lo, &xmm_mask_hi);
2616 0 : xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2617 :
2618 0 : save_128_aligned (
2619 : (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2620 : }
2621 :
2622 0 : pd += 4;
2623 0 : pm += 4;
2624 0 : w -= 4;
2625 : }
2626 :
2627 0 : while (w)
2628 : {
2629 0 : m = *pm++;
2630 :
2631 0 : if (m)
2632 : {
2633 0 : d = *pd;
2634 :
2635 0 : mmx_mask = unpack_32_1x128 (m);
2636 0 : mmx_dest = unpack_32_1x128 (d);
2637 :
2638 0 : *pd = pack_1x128_32 (
2639 : _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2640 : mmx_dest));
2641 : }
2642 :
2643 0 : pd++;
2644 0 : w--;
2645 : }
2646 : }
2647 :
2648 : }
2649 :
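 : /* Fast path: solid source, component-alpha mask, OVER operator:
 :  *
 :  *   dest = src * mask + dest * (1 - src_alpha * mask)
 :  *
 :  * All-zero mask quads are skipped, as in the ADD path above.
 :  */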
2650 : static void
2651 0 : sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2652 : pixman_op_t op,
2653 : pixman_image_t * src_image,
2654 : pixman_image_t * mask_image,
2655 : pixman_image_t * dst_image,
2656 : int32_t src_x,
2657 : int32_t src_y,
2658 : int32_t mask_x,
2659 : int32_t mask_y,
2660 : int32_t dest_x,
2661 : int32_t dest_y,
2662 : int32_t width,
2663 : int32_t height)
2664 : {
2665 : uint32_t src;
2666 : uint32_t *dst_line, d;
2667 : uint32_t *mask_line, m;
2668 : uint32_t pack_cmp;
2669 : int dst_stride, mask_stride;
2670 :
2671 : __m128i xmm_src, xmm_alpha;
2672 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2673 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2674 :
2675 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2676 :
2677 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2678 :
2679 0 : if (src == 0)
2680 0 : return;
2681 :
2682 0 : PIXMAN_IMAGE_GET_LINE (
2683 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2684 0 : PIXMAN_IMAGE_GET_LINE (
2685 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2686 :
2687 0 : xmm_src = _mm_unpacklo_epi8 (
2688 : create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2689 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
2690 0 : mmx_src = xmm_src;
2691 0 : mmx_alpha = xmm_alpha;
2692 :
2693 0 : while (height--)
2694 : {
2695 0 : int w = width;
2696 0 : const uint32_t *pm = (uint32_t *)mask_line;
2697 0 : uint32_t *pd = (uint32_t *)dst_line;
2698 :
2699 0 : dst_line += dst_stride;
2700 0 : mask_line += mask_stride;
2701 :
2702 0 : while (w && (unsigned long)pd & 15)
2703 : {
2704 0 : m = *pm++;
2705 :
2706 0 : if (m)
2707 : {
2708 0 : d = *pd;
2709 0 : mmx_mask = unpack_32_1x128 (m);
2710 0 : mmx_dest = unpack_32_1x128 (d);
2711 :
2712 0 : *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2713 : &mmx_alpha,
2714 : &mmx_mask,
2715 : &mmx_dest));
2716 : }
2717 :
2718 0 : pd++;
2719 0 : w--;
2720 : }
2721 :
2722 0 : while (w >= 4)
2723 : {
2724 0 : xmm_mask = load_128_unaligned ((__m128i*)pm);
2725 :
2726 0 : pack_cmp =
2727 0 : _mm_movemask_epi8 (
2728 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2729 :
2730 :             /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
2731 0 : if (pack_cmp != 0xffff)
2732 : {
2733 0 : xmm_dst = load_128_aligned ((__m128i*)pd);
2734 :
2735 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2736 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2737 :
2738 : in_over_2x128 (&xmm_src, &xmm_src,
2739 : &xmm_alpha, &xmm_alpha,
2740 : &xmm_mask_lo, &xmm_mask_hi,
2741 : &xmm_dst_lo, &xmm_dst_hi);
2742 :
2743 0 : save_128_aligned (
2744 : (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2745 : }
2746 :
2747 0 : pd += 4;
2748 0 : pm += 4;
2749 0 : w -= 4;
2750 : }
2751 :
2752 0 : while (w)
2753 : {
2754 0 : m = *pm++;
2755 :
2756 0 : if (m)
2757 : {
2758 0 : d = *pd;
2759 0 : mmx_mask = unpack_32_1x128 (m);
2760 0 : mmx_dest = unpack_32_1x128 (d);
2761 :
2762 0 : *pd = pack_1x128_32 (
2763 : in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2764 : }
2765 :
2766 0 : pd++;
2767 0 : w--;
2768 : }
2769 : }
2770 :
2771 : }
2772 :
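 : /* Fast path: a8r8g8b8 source, solid mask, OVER.  Only the mask's
 :  * alpha byte matters; it is replicated across all eight 16-bit
 :  * lanes once, and all-zero source quads are skipped.
 :  */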
2773 : static void
2774 0 : sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2775 : pixman_op_t op,
2776 : pixman_image_t * src_image,
2777 : pixman_image_t * mask_image,
2778 : pixman_image_t * dst_image,
2779 : int32_t src_x,
2780 : int32_t src_y,
2781 : int32_t mask_x,
2782 : int32_t mask_y,
2783 : int32_t dest_x,
2784 : int32_t dest_y,
2785 : int32_t width,
2786 : int32_t height)
2787 : {
2788 : uint32_t *dst_line, *dst;
2789 : uint32_t *src_line, *src;
2790 : uint32_t mask;
2791 : int32_t w;
2792 : int dst_stride, src_stride;
2793 :
2794 : __m128i xmm_mask;
2795 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2796 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2797 : __m128i xmm_alpha_lo, xmm_alpha_hi;
2798 :
2799 0 : PIXMAN_IMAGE_GET_LINE (
2800 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2801 0 : PIXMAN_IMAGE_GET_LINE (
2802 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2803 :
2804 0 : mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2805 :
2806 0 : xmm_mask = create_mask_16_128 (mask >> 24);
2807 :
2808 0 : while (height--)
2809 : {
2810 0 : dst = dst_line;
2811 0 : dst_line += dst_stride;
2812 0 : src = src_line;
2813 0 : src_line += src_stride;
2814 0 : w = width;
2815 :
2816 0 : while (w && (unsigned long)dst & 15)
2817 : {
2818 0 : uint32_t s = *src++;
2819 :
2820 0 : if (s)
2821 : {
2822 0 : uint32_t d = *dst;
2823 :
2824 0 :                 __m128i ms = unpack_32_1x128 (s);
2825 0 :                 __m128i alpha = expand_alpha_1x128 (ms);
2826 0 :                 __m128i mask = xmm_mask;
2827 0 :                 __m128i dest = unpack_32_1x128 (d);
2828 :
2829 0 :                 *dst = pack_1x128_32 (
2830 :                     in_over_1x128 (&ms, &alpha, &mask, &dest));
2831 : }
2832 0 : dst++;
2833 0 : w--;
2834 : }
2835 :
2836 0 : while (w >= 4)
2837 : {
2838 0 : xmm_src = load_128_unaligned ((__m128i*)src);
2839 :
2840 0 : if (!is_zero (xmm_src))
2841 : {
2842 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
2843 :
2844 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2845 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2846 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2847 : &xmm_alpha_lo, &xmm_alpha_hi);
2848 :
2849 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2850 : &xmm_alpha_lo, &xmm_alpha_hi,
2851 : &xmm_mask, &xmm_mask,
2852 : &xmm_dst_lo, &xmm_dst_hi);
2853 :
2854 0 : save_128_aligned (
2855 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2856 : }
2857 :
2858 0 : dst += 4;
2859 0 : src += 4;
2860 0 : w -= 4;
2861 : }
2862 :
2863 0 : while (w)
2864 : {
2865 0 : uint32_t s = *src++;
2866 :
2867 0 : if (s)
2868 : {
2869 0 : uint32_t d = *dst;
2870 :
2871 0 : __m128i ms = unpack_32_1x128 (s);
2872 0 : __m128i alpha = expand_alpha_1x128 (ms);
2873 0 : __m128i mask = xmm_mask;
2874 0 : __m128i dest = unpack_32_1x128 (d);
2875 :
2876 0 : *dst = pack_1x128_32 (
2877 : in_over_1x128 (&ms, &alpha, &mask, &dest));
2878 : }
2879 :
2880 0 : dst++;
2881 0 : w--;
2882 : }
2883 : }
2884 :
2885 0 : }
2886 :
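 : /* Fast path: SRC from x8r8g8b8 to a8r8g8b8.  Every pixel is copied
 :  * with the alpha byte forced to 0xff; the SIMD body moves sixteen
 :  * pixels (64 bytes) per iteration.
 :  */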
2887 : static void
2888 10 : sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2889 : pixman_op_t op,
2890 : pixman_image_t * src_image,
2891 : pixman_image_t * mask_image,
2892 : pixman_image_t * dst_image,
2893 : int32_t src_x,
2894 : int32_t src_y,
2895 : int32_t mask_x,
2896 : int32_t mask_y,
2897 : int32_t dest_x,
2898 : int32_t dest_y,
2899 : int32_t width,
2900 : int32_t height)
2901 : {
2902 : uint32_t *dst_line, *dst;
2903 : uint32_t *src_line, *src;
2904 : int32_t w;
2905 : int dst_stride, src_stride;
2906 :
2908 10 : PIXMAN_IMAGE_GET_LINE (
2909 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2910 10 : PIXMAN_IMAGE_GET_LINE (
2911 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2912 :
2913 411 : while (height--)
2914 : {
2915 391 : dst = dst_line;
2916 391 : dst_line += dst_stride;
2917 391 : src = src_line;
2918 391 : src_line += src_stride;
2919 391 : w = width;
2920 :
2921 1022 : while (w && (unsigned long)dst & 15)
2922 : {
2923 240 : *dst++ = *src++ | 0xff000000;
2924 240 : w--;
2925 : }
2926 :
2927 1260 : while (w >= 16)
2928 : {
2929 : __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2930 :
2931 956 : xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2932 956 : xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2933 956 : xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2934 956 : xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2935 :
2936 956 : save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2937 956 : save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2938 956 : save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2939 956 : save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2940 :
2941 478 : dst += 16;
2942 478 : src += 16;
2943 478 : w -= 16;
2944 : }
2945 :
2946 1038 : while (w)
2947 : {
2948 256 : *dst++ = *src++ | 0xff000000;
2949 256 : w--;
2950 : }
2951 : }
2952 :
2953 10 : }
2954 :
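 : /* Fast path: x8r8g8b8 source, solid mask, OVER.  The source is made
 :  * opaque by OR-ing in 0xff000000, so its expanded alpha is the
 :  * constant mask_00ff and only the mask alpha scales the blend.
 :  */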
2955 : static void
2956 0 : sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2957 : pixman_op_t op,
2958 : pixman_image_t * src_image,
2959 : pixman_image_t * mask_image,
2960 : pixman_image_t * dst_image,
2961 : int32_t src_x,
2962 : int32_t src_y,
2963 : int32_t mask_x,
2964 : int32_t mask_y,
2965 : int32_t dest_x,
2966 : int32_t dest_y,
2967 : int32_t width,
2968 : int32_t height)
2969 : {
2970 : uint32_t *dst_line, *dst;
2971 : uint32_t *src_line, *src;
2972 : uint32_t mask;
2973 : int dst_stride, src_stride;
2974 : int32_t w;
2975 :
2976 : __m128i xmm_mask, xmm_alpha;
2977 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2978 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2979 :
2980 0 : PIXMAN_IMAGE_GET_LINE (
2981 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2982 0 : PIXMAN_IMAGE_GET_LINE (
2983 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2984 :
2985 0 : mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2986 :
2987 0 : xmm_mask = create_mask_16_128 (mask >> 24);
2988 0 : xmm_alpha = mask_00ff;
2989 :
2990 0 : while (height--)
2991 : {
2992 0 : dst = dst_line;
2993 0 : dst_line += dst_stride;
2994 0 : src = src_line;
2995 0 : src_line += src_stride;
2996 0 : w = width;
2997 :
2998 0 : while (w && (unsigned long)dst & 15)
2999 : {
3000 0 : uint32_t s = (*src++) | 0xff000000;
3001 0 : uint32_t d = *dst;
3002 :
3003 0 : __m128i src = unpack_32_1x128 (s);
3004 0 : __m128i alpha = xmm_alpha;
3005 0 : __m128i mask = xmm_mask;
3006 0 : __m128i dest = unpack_32_1x128 (d);
3007 :
3008 0 : *dst++ = pack_1x128_32 (
3009 : in_over_1x128 (&src, &alpha, &mask, &dest));
3010 :
3011 0 : w--;
3012 : }
3013 :
3014 0 : while (w >= 4)
3015 : {
3016 0 : xmm_src = _mm_or_si128 (
3017 : load_128_unaligned ((__m128i*)src), mask_ff000000);
3018 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
3019 :
3020 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3021 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3022 :
3023 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3024 : &xmm_alpha, &xmm_alpha,
3025 : &xmm_mask, &xmm_mask,
3026 : &xmm_dst_lo, &xmm_dst_hi);
3027 :
3028 0 : save_128_aligned (
3029 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3030 :
3031 0 : dst += 4;
3032 0 : src += 4;
3033 0 : w -= 4;
3034 :
3035 : }
3036 :
3037 0 : while (w)
3038 : {
3039 0 : uint32_t s = (*src++) | 0xff000000;
3040 0 : uint32_t d = *dst;
3041 :
3042 0 : __m128i src = unpack_32_1x128 (s);
3043 0 : __m128i alpha = xmm_alpha;
3044 0 : __m128i mask = xmm_mask;
3045 0 : __m128i dest = unpack_32_1x128 (d);
3046 :
3047 0 : *dst++ = pack_1x128_32 (
3048 : in_over_1x128 (&src, &alpha, &mask, &dest));
3049 :
3050 0 : w--;
3051 : }
3052 : }
3053 :
3054 0 : }
3055 :
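 : /* Fast path: a8r8g8b8 OVER a8r8g8b8, implemented by running the
 :  * generic sse2_combine_over_u () combiner on each scanline.
 :  */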
3056 : static void
3057 0 : sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3058 : pixman_op_t op,
3059 : pixman_image_t * src_image,
3060 : pixman_image_t * mask_image,
3061 : pixman_image_t * dst_image,
3062 : int32_t src_x,
3063 : int32_t src_y,
3064 : int32_t mask_x,
3065 : int32_t mask_y,
3066 : int32_t dest_x,
3067 : int32_t dest_y,
3068 : int32_t width,
3069 : int32_t height)
3070 : {
3071 : int dst_stride, src_stride;
3072 : uint32_t *dst_line, *dst;
3073 : uint32_t *src_line, *src;
3074 :
3075 0 : PIXMAN_IMAGE_GET_LINE (
3076 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3077 0 : PIXMAN_IMAGE_GET_LINE (
3078 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3079 :
3080 0 : dst = dst_line;
3081 0 : src = src_line;
3082 :
3083 0 : while (height--)
3084 : {
3085 : sse2_combine_over_u (imp, op, dst, src, NULL, width);
3086 :
3087 0 : dst += dst_stride;
3088 0 : src += src_stride;
3089 : }
3090 0 : }
3091 :
3092 : static force_inline uint16_t
3093 : composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3094 : {
3095 : __m128i ms;
3096 :
3097 0 : ms = unpack_32_1x128 (src);
3098 0 : return pack_565_32_16 (
3099 : pack_1x128_32 (
3100 : over_1x128 (
3101 : ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3102 : }
3103 :
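 : /* Fast path: a8r8g8b8 OVER r5g6b5.  Eight pixels per iteration in
 :  * two halves of four, with the second source load issued before the
 :  * first blend to hide memory latency.
 :  */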
3104 : static void
3105 0 : sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3106 : pixman_op_t op,
3107 : pixman_image_t * src_image,
3108 : pixman_image_t * mask_image,
3109 : pixman_image_t * dst_image,
3110 : int32_t src_x,
3111 : int32_t src_y,
3112 : int32_t mask_x,
3113 : int32_t mask_y,
3114 : int32_t dest_x,
3115 : int32_t dest_y,
3116 : int32_t width,
3117 : int32_t height)
3118 : {
3119 : uint16_t *dst_line, *dst, d;
3120 : uint32_t *src_line, *src, s;
3121 : int dst_stride, src_stride;
3122 : int32_t w;
3123 :
3124 : __m128i xmm_alpha_lo, xmm_alpha_hi;
3125 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3126 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3127 :
3128 0 : PIXMAN_IMAGE_GET_LINE (
3129 : dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3130 0 : PIXMAN_IMAGE_GET_LINE (
3131 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3132 :
3133 0 : while (height--)
3134 : {
3135 0 : dst = dst_line;
3136 0 : src = src_line;
3137 :
3138 0 : dst_line += dst_stride;
3139 0 : src_line += src_stride;
3140 0 : w = width;
3141 :
3142 : /* Align dst on a 16-byte boundary */
3143 0 : while (w &&
3144 0 : ((unsigned long)dst & 15))
3145 : {
3146 0 : s = *src++;
3147 0 : d = *dst;
3148 :
3149 0 : *dst++ = composite_over_8888_0565pixel (s, d);
3150 0 : w--;
3151 : }
3152 :
3153 :         /* It's an 8-pixel loop */
3154 0 : while (w >= 8)
3155 : {
3156 :             /* Load the source unaligned, since its address
3157 :              * is not guaranteed to be 16-byte aligned.
3158 :              */
3159 0 : xmm_src = load_128_unaligned ((__m128i*) src);
3160 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3161 :
3162 : /* Unpacking */
3163 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164 0 : unpack_565_128_4x128 (xmm_dst,
3165 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3166 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3167 : &xmm_alpha_lo, &xmm_alpha_hi);
3168 :
3169 :             /* Load the next 4 source pixels early to
3170 :              * overlap the memory read with the blend.
3171 :              */
3172 0 : xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3173 :
3174 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
3175 : &xmm_alpha_lo, &xmm_alpha_hi,
3176 : &xmm_dst0, &xmm_dst1);
3177 :
3178 : /* Unpacking */
3179 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3180 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3181 : &xmm_alpha_lo, &xmm_alpha_hi);
3182 :
3183 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
3184 : &xmm_alpha_lo, &xmm_alpha_hi,
3185 : &xmm_dst2, &xmm_dst3);
3186 :
3187 0 : save_128_aligned (
3188 : (__m128i*)dst, pack_565_4x128_128 (
3189 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3190 :
3191 0 : w -= 8;
3192 0 : dst += 8;
3193 0 : src += 8;
3194 : }
3195 :
3196 0 : while (w--)
3197 : {
3198 0 : s = *src++;
3199 0 : d = *dst;
3200 :
3201 0 : *dst++ = composite_over_8888_0565pixel (s, d);
3202 : }
3203 : }
3204 :
3205 0 : }
3206 :
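 : /* Fast path: solid source, a8 mask, OVER a8r8g8b8.  The mask is
 :  * read four bytes at a time: 0xffffffff with an opaque source
 :  * stores the precomputed solid quad directly, zero skips the quad,
 :  * and anything else does the full in_over blend.
 :  */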
3207 : static void
3208 0 : sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3209 : pixman_op_t op,
3210 : pixman_image_t * src_image,
3211 : pixman_image_t * mask_image,
3212 : pixman_image_t * dst_image,
3213 : int32_t src_x,
3214 : int32_t src_y,
3215 : int32_t mask_x,
3216 : int32_t mask_y,
3217 : int32_t dest_x,
3218 : int32_t dest_y,
3219 : int32_t width,
3220 : int32_t height)
3221 : {
3222 : uint32_t src, srca;
3223 : uint32_t *dst_line, *dst;
3224 : uint8_t *mask_line, *mask;
3225 : int dst_stride, mask_stride;
3226 : int32_t w;
3227 : uint32_t m, d;
3228 :
3229 : __m128i xmm_src, xmm_alpha, xmm_def;
3230 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3231 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3232 :
3233 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3234 :
3235 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3236 :
3237 0 : srca = src >> 24;
3238 0 : if (src == 0)
3239 0 : return;
3240 :
3241 0 : PIXMAN_IMAGE_GET_LINE (
3242 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3243 0 : PIXMAN_IMAGE_GET_LINE (
3244 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3245 :
3246 0 : xmm_def = create_mask_2x32_128 (src, src);
3247 0 : xmm_src = expand_pixel_32_1x128 (src);
3248 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
3249 0 : mmx_src = xmm_src;
3250 0 : mmx_alpha = xmm_alpha;
3251 :
3252 0 : while (height--)
3253 : {
3254 0 : dst = dst_line;
3255 0 : dst_line += dst_stride;
3256 0 : mask = mask_line;
3257 0 : mask_line += mask_stride;
3258 0 : w = width;
3259 :
3260 0 : while (w && (unsigned long)dst & 15)
3261 : {
3262 0 : uint8_t m = *mask++;
3263 :
3264 0 : if (m)
3265 : {
3266 0 : d = *dst;
3267 0 : mmx_mask = expand_pixel_8_1x128 (m);
3268 0 : mmx_dest = unpack_32_1x128 (d);
3269 :
3270 0 : *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3271 : &mmx_alpha,
3272 : &mmx_mask,
3273 : &mmx_dest));
3274 : }
3275 :
3276 0 : w--;
3277 0 : dst++;
3278 : }
3279 :
3280 0 : while (w >= 4)
3281 : {
3282 0 : m = *((uint32_t*)mask);
3283 :
3284 0 : if (srca == 0xff && m == 0xffffffff)
3285 : {
3286 0 : save_128_aligned ((__m128i*)dst, xmm_def);
3287 : }
3288 0 : else if (m)
3289 : {
3290 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3291 0 : xmm_mask = unpack_32_1x128 (m);
3292 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3293 :
3294 : /* Unpacking */
3295 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3296 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3297 :
3298 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3299 : &xmm_mask_lo, &xmm_mask_hi);
3300 :
3301 : in_over_2x128 (&xmm_src, &xmm_src,
3302 : &xmm_alpha, &xmm_alpha,
3303 : &xmm_mask_lo, &xmm_mask_hi,
3304 : &xmm_dst_lo, &xmm_dst_hi);
3305 :
3306 0 : save_128_aligned (
3307 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3308 : }
3309 :
3310 0 : w -= 4;
3311 0 : dst += 4;
3312 0 : mask += 4;
3313 : }
3314 :
3315 0 : while (w)
3316 : {
3317 0 : uint8_t m = *mask++;
3318 :
3319 0 : if (m)
3320 : {
3321 0 : d = *dst;
3322 0 : mmx_mask = expand_pixel_8_1x128 (m);
3323 0 : mmx_dest = unpack_32_1x128 (d);
3324 :
3325 0 : *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3326 : &mmx_alpha,
3327 : &mmx_mask,
3328 : &mmx_dest));
3329 : }
3330 :
3331 0 : w--;
3332 0 : dst++;
3333 : }
3334 : }
3335 :
3336 : }
3337 :
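 : /* Solid fill.  For 8 and 16 bpp the low bits of 'data' are
 :  * replicated to a full 32-bit pattern.  Each scanline stores
 :  * 1/2/4-byte chunks until the pointer is 16-byte aligned, then 128
 :  * bytes per iteration, then a shrinking tail of 64-, 32-, 16-, 4-,
 :  * 2- and 1-byte stores.
 :  */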
3338 : static pixman_bool_t
3339 17 : pixman_fill_sse2 (uint32_t *bits,
3340 : int stride,
3341 : int bpp,
3342 : int x,
3343 : int y,
3344 : int width,
3345 : int height,
3346 : uint32_t data)
3347 : {
3348 : uint32_t byte_width;
3349 : uint8_t *byte_line;
3350 :
3351 : __m128i xmm_def;
3352 :
3353 17 : if (bpp == 8)
3354 : {
3355 : uint8_t b;
3356 : uint16_t w;
3357 :
3358 0 : stride = stride * (int) sizeof (uint32_t) / 1;
3359 0 : byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3360 0 : byte_width = width;
3361 0 : stride *= 1;
3362 :
3363 0 : b = data & 0xff;
3364 0 : w = (b << 8) | b;
3365 0 : data = (w << 16) | w;
3366 : }
3367 17 : else if (bpp == 16)
3368 : {
3369 0 : stride = stride * (int) sizeof (uint32_t) / 2;
3370 0 : byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3371 0 : byte_width = 2 * width;
3372 0 : stride *= 2;
3373 :
3374 0 : data = (data & 0xffff) * 0x00010001;
3375 : }
3376 17 : else if (bpp == 32)
3377 : {
3378 17 : stride = stride * (int) sizeof (uint32_t) / 4;
3379 17 : byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3380 17 : byte_width = 4 * width;
3381 17 : stride *= 4;
3382 : }
3383 : else
3384 : {
3385 0 : return FALSE;
3386 : }
3387 :
3388 17 : xmm_def = create_mask_2x32_128 (data, data);
3389 :
3390 559 : while (height--)
3391 : {
3392 : int w;
3393 525 : uint8_t *d = byte_line;
3394 525 : byte_line += stride;
3395 525 : w = byte_width;
3396 :
3397 1050 : while (w >= 1 && ((unsigned long)d & 1))
3398 : {
3399 0 : *(uint8_t *)d = data;
3400 0 : w -= 1;
3401 0 : d += 1;
3402 : }
3403 :
3404 1050 : while (w >= 2 && ((unsigned long)d & 3))
3405 : {
3406 0 : *(uint16_t *)d = data;
3407 0 : w -= 2;
3408 0 : d += 2;
3409 : }
3410 :
3411 1293 : while (w >= 4 && ((unsigned long)d & 15))
3412 : {
3413 243 : *(uint32_t *)d = data;
3414 :
3415 243 : w -= 4;
3416 243 : d += 4;
3417 : }
3418 :
3419 1545 : while (w >= 128)
3420 : {
3421 495 : save_128_aligned ((__m128i*)(d), xmm_def);
3422 495 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3423 495 : save_128_aligned ((__m128i*)(d + 32), xmm_def);
3424 495 : save_128_aligned ((__m128i*)(d + 48), xmm_def);
3425 495 : save_128_aligned ((__m128i*)(d + 64), xmm_def);
3426 495 : save_128_aligned ((__m128i*)(d + 80), xmm_def);
3427 495 : save_128_aligned ((__m128i*)(d + 96), xmm_def);
3428 495 : save_128_aligned ((__m128i*)(d + 112), xmm_def);
3429 :
3430 495 : d += 128;
3431 495 : w -= 128;
3432 : }
3433 :
3434 525 : if (w >= 64)
3435 : {
3436 64 : save_128_aligned ((__m128i*)(d), xmm_def);
3437 64 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3438 64 : save_128_aligned ((__m128i*)(d + 32), xmm_def);
3439 64 : save_128_aligned ((__m128i*)(d + 48), xmm_def);
3440 :
3441 64 : d += 64;
3442 64 : w -= 64;
3443 : }
3444 :
3445 525 : if (w >= 32)
3446 : {
3447 0 : save_128_aligned ((__m128i*)(d), xmm_def);
3448 0 : save_128_aligned ((__m128i*)(d + 16), xmm_def);
3449 :
3450 0 : d += 32;
3451 0 : w -= 32;
3452 : }
3453 :
3454 525 : if (w >= 16)
3455 : {
3456 4 : save_128_aligned ((__m128i*)(d), xmm_def);
3457 :
3458 4 : d += 16;
3459 4 : w -= 16;
3460 : }
3461 :
3462 1299 : while (w >= 4)
3463 : {
3464 249 : *(uint32_t *)d = data;
3465 :
3466 249 : w -= 4;
3467 249 : d += 4;
3468 : }
3469 :
3470 525 : if (w >= 2)
3471 : {
3472 0 : *(uint16_t *)d = data;
3473 0 : w -= 2;
3474 0 : d += 2;
3475 : }
3476 :
3477 525 : if (w >= 1)
3478 : {
3479 0 : *(uint8_t *)d = data;
3480 0 : w -= 1;
3481 0 : d += 1;
3482 : }
3483 : }
3484 :
3485 17 : return TRUE;
3486 : }
3487 :
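 : /* SRC with solid source and a8 mask: dest = src * mask.  A zero
 :  * source degenerates to a fill with zero; unlike the OVER paths, a
 :  * zero mask writes zero to the destination instead of skipping it.
 :  */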
3488 : static void
3489 0 : sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3490 : pixman_op_t op,
3491 : pixman_image_t * src_image,
3492 : pixman_image_t * mask_image,
3493 : pixman_image_t * dst_image,
3494 : int32_t src_x,
3495 : int32_t src_y,
3496 : int32_t mask_x,
3497 : int32_t mask_y,
3498 : int32_t dest_x,
3499 : int32_t dest_y,
3500 : int32_t width,
3501 : int32_t height)
3502 : {
3503 : uint32_t src, srca;
3504 : uint32_t *dst_line, *dst;
3505 : uint8_t *mask_line, *mask;
3506 : int dst_stride, mask_stride;
3507 : int32_t w;
3508 : uint32_t m;
3509 :
3510 : __m128i xmm_src, xmm_def;
3511 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3512 :
3513 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3514 :
3515 0 : srca = src >> 24;
3516 0 : if (src == 0)
3517 : {
3518 0 : pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3519 0 : PIXMAN_FORMAT_BPP (dst_image->bits.format),
3520 : dest_x, dest_y, width, height, 0);
3521 0 : return;
3522 : }
3523 :
3524 0 : PIXMAN_IMAGE_GET_LINE (
3525 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3526 0 : PIXMAN_IMAGE_GET_LINE (
3527 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3528 :
3529 0 : xmm_def = create_mask_2x32_128 (src, src);
3530 0 : xmm_src = expand_pixel_32_1x128 (src);
3531 :
3532 0 : while (height--)
3533 : {
3534 0 : dst = dst_line;
3535 0 : dst_line += dst_stride;
3536 0 : mask = mask_line;
3537 0 : mask_line += mask_stride;
3538 0 : w = width;
3539 :
3540 0 : while (w && (unsigned long)dst & 15)
3541 : {
3542 0 : uint8_t m = *mask++;
3543 :
3544 0 : if (m)
3545 : {
3546 0 : *dst = pack_1x128_32 (
3547 : pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3548 : }
3549 : else
3550 : {
3551 0 : *dst = 0;
3552 : }
3553 :
3554 0 : w--;
3555 0 : dst++;
3556 : }
3557 :
3558 0 : while (w >= 4)
3559 : {
3560 0 : m = *((uint32_t*)mask);
3561 :
3562 0 : if (srca == 0xff && m == 0xffffffff)
3563 : {
3564 0 : save_128_aligned ((__m128i*)dst, xmm_def);
3565 : }
3566 0 : else if (m)
3567 : {
3568 0 : xmm_mask = unpack_32_1x128 (m);
3569 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3570 :
3571 : /* Unpacking */
3572 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3573 :
3574 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3575 : &xmm_mask_lo, &xmm_mask_hi);
3576 :
3577 : pix_multiply_2x128 (&xmm_src, &xmm_src,
3578 : &xmm_mask_lo, &xmm_mask_hi,
3579 : &xmm_mask_lo, &xmm_mask_hi);
3580 :
3581 0 : save_128_aligned (
3582 : (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3583 : }
3584 : else
3585 : {
3586 0 : save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3587 : }
3588 :
3589 0 : w -= 4;
3590 0 : dst += 4;
3591 0 : mask += 4;
3592 : }
3593 :
3594 0 : while (w)
3595 : {
3596 0 : uint8_t m = *mask++;
3597 :
3598 0 : if (m)
3599 : {
3600 0 : *dst = pack_1x128_32 (
3601 : pix_multiply_1x128 (
3602 : xmm_src, expand_pixel_8_1x128 (m)));
3603 : }
3604 : else
3605 : {
3606 0 : *dst = 0;
3607 : }
3608 :
3609 0 : w--;
3610 0 : dst++;
3611 : }
3612 : }
3613 :
3614 : }
3615 :
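 : /* Fast path: solid source, a8 mask, OVER r5g6b5.  Eight pixels per
 :  * iteration, with the mask read as two 4-byte words; an all-zero
 :  * word leaves its four destination pixels unchanged.
 :  */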
3616 : static void
3617 0 : sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3618 : pixman_op_t op,
3619 : pixman_image_t * src_image,
3620 : pixman_image_t * mask_image,
3621 : pixman_image_t * dst_image,
3622 : int32_t src_x,
3623 : int32_t src_y,
3624 : int32_t mask_x,
3625 : int32_t mask_y,
3626 : int32_t dest_x,
3627 : int32_t dest_y,
3628 : int32_t width,
3629 : int32_t height)
3630 : {
3631 : uint32_t src, srca;
3632 : uint16_t *dst_line, *dst, d;
3633 : uint8_t *mask_line, *mask;
3634 : int dst_stride, mask_stride;
3635 : int32_t w;
3636 : uint32_t m;
3637 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3638 :
3639 : __m128i xmm_src, xmm_alpha;
3640 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3641 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3642 :
3643 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3644 :
3645 0 : srca = src >> 24;
3646 0 : if (src == 0)
3647 0 : return;
3648 :
3649 0 : PIXMAN_IMAGE_GET_LINE (
3650 : dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3651 0 : PIXMAN_IMAGE_GET_LINE (
3652 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3653 :
3654 0 : xmm_src = expand_pixel_32_1x128 (src);
3655 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
3656 0 : mmx_src = xmm_src;
3657 0 : mmx_alpha = xmm_alpha;
3658 :
3659 0 : while (height--)
3660 : {
3661 0 : dst = dst_line;
3662 0 : dst_line += dst_stride;
3663 0 : mask = mask_line;
3664 0 : mask_line += mask_stride;
3665 0 : w = width;
3666 :
3667 0 : while (w && (unsigned long)dst & 15)
3668 : {
3669 0 : m = *mask++;
3670 :
3671 0 : if (m)
3672 : {
3673 0 : d = *dst;
3674 0 : mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3675 0 : mmx_dest = expand565_16_1x128 (d);
3676 :
3677 0 : *dst = pack_565_32_16 (
3678 : pack_1x128_32 (
3679 : in_over_1x128 (
3680 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3681 : }
3682 :
3683 0 : w--;
3684 0 : dst++;
3685 : }
3686 :
3687 0 : while (w >= 8)
3688 : {
3689 0 : xmm_dst = load_128_aligned ((__m128i*) dst);
3690 0 : unpack_565_128_4x128 (xmm_dst,
3691 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3692 :
3693 0 : m = *((uint32_t*)mask);
3694 0 : mask += 4;
3695 :
3696 0 : if (m)
3697 : {
3698 0 : xmm_mask = unpack_32_1x128 (m);
3699 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3700 :
3701 : /* Unpacking */
3702 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3703 :
3704 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3705 : &xmm_mask_lo, &xmm_mask_hi);
3706 :
3707 : in_over_2x128 (&xmm_src, &xmm_src,
3708 : &xmm_alpha, &xmm_alpha,
3709 : &xmm_mask_lo, &xmm_mask_hi,
3710 : &xmm_dst0, &xmm_dst1);
3711 : }
3712 :
3713 0 : m = *((uint32_t*)mask);
3714 0 : mask += 4;
3715 :
3716 0 : if (m)
3717 : {
3718 0 : xmm_mask = unpack_32_1x128 (m);
3719 0 : xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3720 :
3721 : /* Unpacking */
3722 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3723 :
3724 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3725 : &xmm_mask_lo, &xmm_mask_hi);
3726 : in_over_2x128 (&xmm_src, &xmm_src,
3727 : &xmm_alpha, &xmm_alpha,
3728 : &xmm_mask_lo, &xmm_mask_hi,
3729 : &xmm_dst2, &xmm_dst3);
3730 : }
3731 :
3732 0 : save_128_aligned (
3733 : (__m128i*)dst, pack_565_4x128_128 (
3734 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3735 :
3736 0 : w -= 8;
3737 0 : dst += 8;
3738 : }
3739 :
3740 0 : while (w)
3741 : {
3742 0 : m = *mask++;
3743 :
3744 0 : if (m)
3745 : {
3746 0 : d = *dst;
3747 0 : mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3748 0 : mmx_dest = expand565_16_1x128 (d);
3749 :
3750 0 : *dst = pack_565_32_16 (
3751 : pack_1x128_32 (
3752 : in_over_1x128 (
3753 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3754 : }
3755 :
3756 0 : w--;
3757 0 : dst++;
3758 : }
3759 : }
3760 :
3761 : }
3762 :
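 : /* OVER for "pixbuf" sources (non-premultiplied, with R and B
 :  * swapped, as the over_rev_non_pre helpers imply) onto r5g6b5.
 :  * Per quad: all-opaque sources only need the channel swap,
 :  * all-zero sources are skipped, and everything else goes through
 :  * over_rev_non_pre_2x128 (), which premultiplies and blends.
 :  */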
3763 : static void
3764 0 : sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3765 : pixman_op_t op,
3766 : pixman_image_t * src_image,
3767 : pixman_image_t * mask_image,
3768 : pixman_image_t * dst_image,
3769 : int32_t src_x,
3770 : int32_t src_y,
3771 : int32_t mask_x,
3772 : int32_t mask_y,
3773 : int32_t dest_x,
3774 : int32_t dest_y,
3775 : int32_t width,
3776 : int32_t height)
3777 : {
3778 : uint16_t *dst_line, *dst, d;
3779 : uint32_t *src_line, *src, s;
3780 : int dst_stride, src_stride;
3781 : int32_t w;
3782 : uint32_t opaque, zero;
3783 :
3784 : __m128i ms;
3785 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3786 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3787 :
3788 0 : PIXMAN_IMAGE_GET_LINE (
3789 : dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3790 0 : PIXMAN_IMAGE_GET_LINE (
3791 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3792 :
3793 0 : while (height--)
3794 : {
3795 0 : dst = dst_line;
3796 0 : dst_line += dst_stride;
3797 0 : src = src_line;
3798 0 : src_line += src_stride;
3799 0 : w = width;
3800 :
3801 0 : while (w && (unsigned long)dst & 15)
3802 : {
3803 0 : s = *src++;
3804 0 : d = *dst;
3805 :
3806 0 : ms = unpack_32_1x128 (s);
3807 :
3808 0 : *dst++ = pack_565_32_16 (
3809 : pack_1x128_32 (
3810 : over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3811 0 : w--;
3812 : }
3813 :
3814 0 : while (w >= 8)
3815 : {
3816 : /* First round */
3817 0 : xmm_src = load_128_unaligned ((__m128i*)src);
3818 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
3819 :
3820 0 : opaque = is_opaque (xmm_src);
3821 0 : zero = is_zero (xmm_src);
3822 :
3823 0 : unpack_565_128_4x128 (xmm_dst,
3824 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3825 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3826 :
3827 :             /* preload next round */
3828 0 : xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3829 :
3830 0 : if (opaque)
3831 : {
3832 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3833 : &xmm_dst0, &xmm_dst1);
3834 : }
3835 0 : else if (!zero)
3836 : {
3837 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3838 : &xmm_dst0, &xmm_dst1);
3839 : }
3840 :
3841 : /* Second round */
3842 0 : opaque = is_opaque (xmm_src);
3843 0 : zero = is_zero (xmm_src);
3844 :
3845 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3846 :
3847 0 : if (opaque)
3848 : {
3849 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3850 : &xmm_dst2, &xmm_dst3);
3851 : }
3852 0 : else if (!zero)
3853 : {
3854 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3855 : &xmm_dst2, &xmm_dst3);
3856 : }
3857 :
3858 0 : save_128_aligned (
3859 : (__m128i*)dst, pack_565_4x128_128 (
3860 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3861 :
3862 0 : w -= 8;
3863 0 : src += 8;
3864 0 : dst += 8;
3865 : }
3866 :
3867 0 : while (w)
3868 : {
3869 0 : s = *src++;
3870 0 : d = *dst;
3871 :
3872 0 : ms = unpack_32_1x128 (s);
3873 :
3874 0 : *dst++ = pack_565_32_16 (
3875 : pack_1x128_32 (
3876 : over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3877 0 : w--;
3878 : }
3879 : }
3880 :
3881 0 : }
3882 :
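 : /* Same pixbuf OVER as above, but with an a8r8g8b8 destination:
 :  * opaque quads are channel-swapped and stored directly, all-zero
 :  * quads are skipped, and the rest are blended with
 :  * over_rev_non_pre_2x128 ().
 :  */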
3883 : static void
3884 0 : sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3885 : pixman_op_t op,
3886 : pixman_image_t * src_image,
3887 : pixman_image_t * mask_image,
3888 : pixman_image_t * dst_image,
3889 : int32_t src_x,
3890 : int32_t src_y,
3891 : int32_t mask_x,
3892 : int32_t mask_y,
3893 : int32_t dest_x,
3894 : int32_t dest_y,
3895 : int32_t width,
3896 : int32_t height)
3897 : {
3898 : uint32_t *dst_line, *dst, d;
3899 : uint32_t *src_line, *src, s;
3900 : int dst_stride, src_stride;
3901 : int32_t w;
3902 : uint32_t opaque, zero;
3903 :
3904 : __m128i xmm_src_lo, xmm_src_hi;
3905 : __m128i xmm_dst_lo, xmm_dst_hi;
3906 :
3907 0 : PIXMAN_IMAGE_GET_LINE (
3908 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3909 0 : PIXMAN_IMAGE_GET_LINE (
3910 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3911 :
3912 0 : while (height--)
3913 : {
3914 0 : dst = dst_line;
3915 0 : dst_line += dst_stride;
3916 0 : src = src_line;
3917 0 : src_line += src_stride;
3918 0 : w = width;
3919 :
3920 0 : while (w && (unsigned long)dst & 15)
3921 : {
3922 0 : s = *src++;
3923 0 : d = *dst;
3924 :
3925 0 : *dst++ = pack_1x128_32 (
3926 : over_rev_non_pre_1x128 (
3927 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
3928 :
3929 0 : w--;
3930 : }
3931 :
3932 0 : while (w >= 4)
3933 : {
3934 0 : xmm_src_hi = load_128_unaligned ((__m128i*)src);
3935 :
3936 0 : opaque = is_opaque (xmm_src_hi);
3937 0 : zero = is_zero (xmm_src_hi);
3938 :
3939 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3940 :
3941 0 : if (opaque)
3942 : {
3943 0 : invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3944 : &xmm_dst_lo, &xmm_dst_hi);
3945 :
3946 0 : save_128_aligned (
3947 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3948 : }
3949 0 : else if (!zero)
3950 : {
3951 0 : xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3952 :
3953 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3954 :
3955 0 : over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3956 : &xmm_dst_lo, &xmm_dst_hi);
3957 :
3958 0 : save_128_aligned (
3959 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3960 : }
3961 :
3962 0 : w -= 4;
3963 0 : dst += 4;
3964 0 : src += 4;
3965 : }
3966 :
3967 0 : while (w)
3968 : {
3969 0 : s = *src++;
3970 0 : d = *dst;
3971 :
3972 0 : *dst++ = pack_1x128_32 (
3973 : over_rev_non_pre_1x128 (
3974 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
3975 :
3976 0 : w--;
3977 : }
3978 : }
3979 :
3980 0 : }
3981 :
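 : /* Fast path: solid source, component-alpha mask, OVER r5g6b5.
 :  * Eight pixels per iteration in two rounds of four; a round whose
 :  * four mask pixels are all zero is skipped.
 :  */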
3982 : static void
3983 0 : sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3984 : pixman_op_t op,
3985 : pixman_image_t * src_image,
3986 : pixman_image_t * mask_image,
3987 : pixman_image_t * dst_image,
3988 : int32_t src_x,
3989 : int32_t src_y,
3990 : int32_t mask_x,
3991 : int32_t mask_y,
3992 : int32_t dest_x,
3993 : int32_t dest_y,
3994 : int32_t width,
3995 : int32_t height)
3996 : {
3997 : uint32_t src;
3998 : uint16_t *dst_line, *dst, d;
3999 : uint32_t *mask_line, *mask, m;
4000 : int dst_stride, mask_stride;
4001 : int w;
4002 : uint32_t pack_cmp;
4003 :
4004 : __m128i xmm_src, xmm_alpha;
4005 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4006 : __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4007 :
4008 : __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4009 :
4010 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4011 :
4012 0 : if (src == 0)
4013 0 : return;
4014 :
4015 0 : PIXMAN_IMAGE_GET_LINE (
4016 : dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4017 0 : PIXMAN_IMAGE_GET_LINE (
4018 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4019 :
4020 0 : xmm_src = expand_pixel_32_1x128 (src);
4021 0 : xmm_alpha = expand_alpha_1x128 (xmm_src);
4022 0 : mmx_src = xmm_src;
4023 0 : mmx_alpha = xmm_alpha;
4024 :
4025 0 : while (height--)
4026 : {
4027 0 : w = width;
4028 0 : mask = mask_line;
4029 0 : dst = dst_line;
4030 0 : mask_line += mask_stride;
4031 0 : dst_line += dst_stride;
4032 :
4033 0 : while (w && ((unsigned long)dst & 15))
4034 : {
4035 0 : m = *(uint32_t *) mask;
4036 :
4037 0 : if (m)
4038 : {
4039 0 : d = *dst;
4040 0 : mmx_mask = unpack_32_1x128 (m);
4041 0 : mmx_dest = expand565_16_1x128 (d);
4042 :
4043 0 : *dst = pack_565_32_16 (
4044 : pack_1x128_32 (
4045 : in_over_1x128 (
4046 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4047 : }
4048 :
4049 0 : w--;
4050 0 : dst++;
4051 0 : mask++;
4052 : }
4053 :
4054 0 : while (w >= 8)
4055 : {
4056 : /* First round */
4057 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
4058 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4059 :
4060 0 : pack_cmp = _mm_movemask_epi8 (
4061 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4062 :
4063 0 : unpack_565_128_4x128 (xmm_dst,
4064 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4065 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4066 :
4067 : /* preload next round */
4068 0 : xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4069 :
4071 0 : if (pack_cmp != 0xffff)
4072 : {
4073 : in_over_2x128 (&xmm_src, &xmm_src,
4074 : &xmm_alpha, &xmm_alpha,
4075 : &xmm_mask_lo, &xmm_mask_hi,
4076 : &xmm_dst0, &xmm_dst1);
4077 : }
4078 :
4079 : /* Second round */
4080 0 : pack_cmp = _mm_movemask_epi8 (
4081 : _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4082 :
4083 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4084 :
4085 0 : if (pack_cmp != 0xffff)
4086 : {
4087 : in_over_2x128 (&xmm_src, &xmm_src,
4088 : &xmm_alpha, &xmm_alpha,
4089 : &xmm_mask_lo, &xmm_mask_hi,
4090 : &xmm_dst2, &xmm_dst3);
4091 : }
4092 :
4093 0 : save_128_aligned (
4094 : (__m128i*)dst, pack_565_4x128_128 (
4095 : &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4096 :
4097 0 : w -= 8;
4098 0 : dst += 8;
4099 0 : mask += 8;
4100 : }
4101 :
4102 0 : while (w)
4103 : {
4104 0 : m = *(uint32_t *) mask;
4105 :
4106 0 : if (m)
4107 : {
4108 0 : d = *dst;
4109 0 : mmx_mask = unpack_32_1x128 (m);
4110 0 : mmx_dest = expand565_16_1x128 (d);
4111 :
4112 0 : *dst = pack_565_32_16 (
4113 : pack_1x128_32 (
4114 : in_over_1x128 (
4115 : &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4116 : }
4117 :
4118 0 : w--;
4119 0 : dst++;
4120 0 : mask++;
4121 : }
4122 : }
4123 :
4124 : }
4125 :
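 : /* IN: dest = src_alpha * mask * dest on a8 data, sixteen bytes per
 :  * SIMD iteration.
 :  */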
4126 : static void
4127 0 : sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4128 : pixman_op_t op,
4129 : pixman_image_t * src_image,
4130 : pixman_image_t * mask_image,
4131 : pixman_image_t * dst_image,
4132 : int32_t src_x,
4133 : int32_t src_y,
4134 : int32_t mask_x,
4135 : int32_t mask_y,
4136 : int32_t dest_x,
4137 : int32_t dest_y,
4138 : int32_t width,
4139 : int32_t height)
4140 : {
4141 : uint8_t *dst_line, *dst;
4142 : uint8_t *mask_line, *mask;
4143 : int dst_stride, mask_stride;
4144 : uint32_t d, m;
4145 : uint32_t src;
4146 : uint8_t sa;
4147 : int32_t w;
4148 :
4149 : __m128i xmm_alpha;
4150 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4151 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152 :
4153 0 : PIXMAN_IMAGE_GET_LINE (
4154 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155 0 : PIXMAN_IMAGE_GET_LINE (
4156 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4157 :
4158 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4159 :
4160 0 : sa = src >> 24;
4161 :
4162 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163 :
4164 0 : while (height--)
4165 : {
4166 0 : dst = dst_line;
4167 0 : dst_line += dst_stride;
4168 0 : mask = mask_line;
4169 0 : mask_line += mask_stride;
4170 0 : w = width;
4171 :
4172 0 : while (w && ((unsigned long)dst & 15))
4173 : {
4174 0 : m = (uint32_t) *mask++;
4175 0 : d = (uint32_t) *dst;
4176 :
4177 0 : *dst++ = (uint8_t) pack_1x128_32 (
4178 : pix_multiply_1x128 (
4179 : pix_multiply_1x128 (xmm_alpha,
4180 : unpack_32_1x128 (m)),
4181 : unpack_32_1x128 (d)));
4182 0 : w--;
4183 : }
4184 :
4185 0 : while (w >= 16)
4186 : {
4187 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
4188 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4189 :
4190 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4191 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4192 :
4193 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4194 : &xmm_mask_lo, &xmm_mask_hi,
4195 : &xmm_mask_lo, &xmm_mask_hi);
4196 :
4197 : pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4198 : &xmm_dst_lo, &xmm_dst_hi,
4199 : &xmm_dst_lo, &xmm_dst_hi);
4200 :
4201 0 : save_128_aligned (
4202 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4203 :
4204 0 : mask += 16;
4205 0 : dst += 16;
4206 0 : w -= 16;
4207 : }
4208 :
4209 0 : while (w)
4210 : {
4211 0 : m = (uint32_t) *mask++;
4212 0 : d = (uint32_t) *dst;
4213 :
4214 0 : *dst++ = (uint8_t) pack_1x128_32 (
4215 : pix_multiply_1x128 (
4216 : pix_multiply_1x128 (
4217 : xmm_alpha, unpack_32_1x128 (m)),
4218 : unpack_32_1x128 (d)));
4219 0 : w--;
4220 : }
4221 : }
4222 :
4223 0 : }
4224 :
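 : /* IN with no mask: dest = src_alpha * dest.  src_alpha == 0xff is a
 :  * no-op, and src_alpha == 0 degenerates to a fill with zero.
 :  */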
4225 : static void
4226 0 : sse2_composite_in_n_8 (pixman_implementation_t *imp,
4227 : pixman_op_t op,
4228 : pixman_image_t * src_image,
4229 : pixman_image_t * mask_image,
4230 : pixman_image_t * dst_image,
4231 : int32_t src_x,
4232 : int32_t src_y,
4233 : int32_t mask_x,
4234 : int32_t mask_y,
4235 : int32_t dest_x,
4236 : int32_t dest_y,
4237 : int32_t width,
4238 : int32_t height)
4239 : {
4240 : uint8_t *dst_line, *dst;
4241 : int dst_stride;
4242 : uint32_t d;
4243 : uint32_t src;
4244 : int32_t w;
4245 :
4246 : __m128i xmm_alpha;
4247 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4248 :
4249 0 : PIXMAN_IMAGE_GET_LINE (
4250 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4251 :
4252 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4253 :
4254 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4255 :
4256 0 : src = src >> 24;
4257 :
4258 0 : if (src == 0xff)
4259 0 : return;
4260 :
4261 0 : if (src == 0x00)
4262 : {
4263 0 : pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4264 : 8, dest_x, dest_y, width, height, src);
4265 :
4266 0 : return;
4267 : }
4268 :
4269 0 : while (height--)
4270 : {
4271 0 : dst = dst_line;
4272 0 : dst_line += dst_stride;
4273 0 : w = width;
4274 :
4275 0 : while (w && ((unsigned long)dst & 15))
4276 : {
4277 0 : d = (uint32_t) *dst;
4278 :
4279 0 : *dst++ = (uint8_t) pack_1x128_32 (
4280 : pix_multiply_1x128 (
4281 : xmm_alpha,
4282 : unpack_32_1x128 (d)));
4283 0 : w--;
4284 : }
4285 :
4286 0 : while (w >= 16)
4287 : {
4288 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4289 :
4290 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4291 :
4292 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4293 : &xmm_dst_lo, &xmm_dst_hi,
4294 : &xmm_dst_lo, &xmm_dst_hi);
4295 :
4296 0 : save_128_aligned (
4297 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4298 :
4299 0 : dst += 16;
4300 0 : w -= 16;
4301 : }
4302 :
4303 0 : while (w)
4304 : {
4305 0 : d = (uint32_t) *dst;
4306 :
4307 0 : *dst++ = (uint8_t) pack_1x128_32 (
4308 : pix_multiply_1x128 (
4309 : xmm_alpha,
4310 : unpack_32_1x128 (d)));
4311 0 : w--;
4312 : }
4313 : }
4314 :
4315 : }
4316 :
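     : /* IN between two a8 images: dest = src * dest, processed
     :  * 16 pixels per SSE2 iteration once dst is 16-byte aligned.
     :  */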
4317 : static void
4318 0 : sse2_composite_in_8_8 (pixman_implementation_t *imp,
4319 : pixman_op_t op,
4320 : pixman_image_t * src_image,
4321 : pixman_image_t * mask_image,
4322 : pixman_image_t * dst_image,
4323 : int32_t src_x,
4324 : int32_t src_y,
4325 : int32_t mask_x,
4326 : int32_t mask_y,
4327 : int32_t dest_x,
4328 : int32_t dest_y,
4329 : int32_t width,
4330 : int32_t height)
4331 : {
4332 : uint8_t *dst_line, *dst;
4333 : uint8_t *src_line, *src;
4334 : int src_stride, dst_stride;
4335 : int32_t w;
4336 : uint32_t s, d;
4337 :
4338 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4339 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4340 :
4341 0 : PIXMAN_IMAGE_GET_LINE (
4342 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4343 0 : PIXMAN_IMAGE_GET_LINE (
4344 : src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4345 :
4346 0 : while (height--)
4347 : {
4348 0 : dst = dst_line;
4349 0 : dst_line += dst_stride;
4350 0 : src = src_line;
4351 0 : src_line += src_stride;
4352 0 : w = width;
4353 :
4354 0 : while (w && ((unsigned long)dst & 15))
4355 : {
4356 0 : s = (uint32_t) *src++;
4357 0 : d = (uint32_t) *dst;
4358 :
4359 0 : *dst++ = (uint8_t) pack_1x128_32 (
4360 : pix_multiply_1x128 (
4361 : unpack_32_1x128 (s), unpack_32_1x128 (d)));
4362 0 : w--;
4363 : }
4364 :
4365 0 : while (w >= 16)
4366 : {
4367 0 : xmm_src = load_128_unaligned ((__m128i*)src);
4368 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4369 :
4370 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4371 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4372 :
4373 : pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4374 : &xmm_dst_lo, &xmm_dst_hi,
4375 : &xmm_dst_lo, &xmm_dst_hi);
4376 :
4377 0 : save_128_aligned (
4378 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4379 :
4380 0 : src += 16;
4381 0 : dst += 16;
4382 0 : w -= 16;
4383 : }
4384 :
4385 0 : while (w)
4386 : {
4387 0 : s = (uint32_t) *src++;
4388 0 : d = (uint32_t) *dst;
4389 :
4390 0 : *dst++ = (uint8_t) pack_1x128_32 (
4391 : pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4392 0 : w--;
4393 : }
4394 : }
4395 :
4396 0 : }
4397 :
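     : /* ADD with a solid source and an a8 mask onto an a8 destination:
     :  * dest = clamp (srcA * mask + dest), using saturating adds.
     :  */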
4398 : static void
4399 0 : sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4400 : pixman_op_t op,
4401 : pixman_image_t * src_image,
4402 : pixman_image_t * mask_image,
4403 : pixman_image_t * dst_image,
4404 : int32_t src_x,
4405 : int32_t src_y,
4406 : int32_t mask_x,
4407 : int32_t mask_y,
4408 : int32_t dest_x,
4409 : int32_t dest_y,
4410 : int32_t width,
4411 : int32_t height)
4412 : {
4413 : uint8_t *dst_line, *dst;
4414 : uint8_t *mask_line, *mask;
4415 : int dst_stride, mask_stride;
4416 : int32_t w;
4417 : uint32_t src;
4419 : uint32_t m, d;
4420 :
4421 : __m128i xmm_alpha;
4422 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4423 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4424 :
4425 0 : PIXMAN_IMAGE_GET_LINE (
4426 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4427 0 : PIXMAN_IMAGE_GET_LINE (
4428 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4429 :
4430 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4431 :
4434 0 : xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4435 :
4436 0 : while (height--)
4437 : {
4438 0 : dst = dst_line;
4439 0 : dst_line += dst_stride;
4440 0 : mask = mask_line;
4441 0 : mask_line += mask_stride;
4442 0 : w = width;
4443 :
4444 0 : while (w && ((unsigned long)dst & 15))
4445 : {
4446 0 : m = (uint32_t) *mask++;
4447 0 : d = (uint32_t) *dst;
4448 :
4449 0 : *dst++ = (uint8_t) pack_1x128_32 (
4450 : _mm_adds_epu16 (
4451 : pix_multiply_1x128 (
4452 : xmm_alpha, unpack_32_1x128 (m)),
4453 : unpack_32_1x128 (d)));
4454 0 : w--;
4455 : }
4456 :
4457 0 : while (w >= 16)
4458 : {
4459 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
4460 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4461 :
4462 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4463 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4464 :
4465 : pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4466 : &xmm_mask_lo, &xmm_mask_hi,
4467 : &xmm_mask_lo, &xmm_mask_hi);
4468 :
4469 0 : xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4470 0 : xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4471 :
4472 0 : save_128_aligned (
4473 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4474 :
4475 0 : mask += 16;
4476 0 : dst += 16;
4477 0 : w -= 16;
4478 : }
4479 :
4480 0 : while (w)
4481 : {
4482 0 : m = (uint32_t) *mask++;
4483 0 : d = (uint32_t) *dst;
4484 :
4485 0 : *dst++ = (uint8_t) pack_1x128_32 (
4486 : _mm_adds_epu16 (
4487 : pix_multiply_1x128 (
4488 : xmm_alpha, unpack_32_1x128 (m)),
4489 : unpack_32_1x128 (d)));
4490 :
4491 0 : w--;
4492 : }
4493 : }
4494 :
4495 0 : }
4496 :
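     : /* ADD with a solid source and no mask onto an a8 destination:
     :  * dest = clamp (srcA + dest).  srcA == 0x00 is a no-op, and
     :  * srcA == 0xff saturates every pixel, so it becomes a fill.
     :  */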
4497 : static void
4498 0 : sse2_composite_add_n_8 (pixman_implementation_t *imp,
4499 : pixman_op_t op,
4500 : pixman_image_t * src_image,
4501 : pixman_image_t * mask_image,
4502 : pixman_image_t * dst_image,
4503 : int32_t src_x,
4504 : int32_t src_y,
4505 : int32_t mask_x,
4506 : int32_t mask_y,
4507 : int32_t dest_x,
4508 : int32_t dest_y,
4509 : int32_t width,
4510 : int32_t height)
4511 : {
4512 : uint8_t *dst_line, *dst;
4513 : int dst_stride;
4514 : int32_t w;
4515 : uint32_t src;
4516 :
4517 : __m128i xmm_src;
4518 :
4519 0 : PIXMAN_IMAGE_GET_LINE (
4520 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4521 :
4522 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4523 :
4524 0 : src >>= 24;
4525 :
4526 0 : if (src == 0x00)
4527 0 : return;
4528 :
4529 0 : if (src == 0xff)
4530 : {
4531 0 : pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4532 : 8, dest_x, dest_y, width, height, 0xff);
4533 :
4534 0 : return;
4535 : }
4536 :
4537 0 : src = (src << 24) | (src << 16) | (src << 8) | src;
4538 0 : xmm_src = _mm_set_epi32 (src, src, src, src);
4539 :
4540 0 : while (height--)
4541 : {
4542 0 : dst = dst_line;
4543 0 : dst_line += dst_stride;
4544 0 : w = width;
4545 :
4546 0 : while (w && ((unsigned long)dst & 15))
4547 : {
4548 0 : *dst = (uint8_t)_mm_cvtsi128_si32 (
4549 : _mm_adds_epu8 (
4550 : xmm_src,
4551 0 : _mm_cvtsi32_si128 (*dst)));
4552 :
4553 0 : w--;
4554 0 : dst++;
4555 : }
4556 :
4557 0 : while (w >= 16)
4558 : {
4559 0 : save_128_aligned (
4560 : (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4561 :
4562 0 : dst += 16;
4563 0 : w -= 16;
4564 : }
4565 :
4566 0 : while (w)
4567 : {
4568 0 : *dst = (uint8_t)_mm_cvtsi128_si32 (
4569 : _mm_adds_epu8 (
4570 : xmm_src,
4571 0 : _mm_cvtsi32_si128 (*dst)));
4572 :
4573 0 : w--;
4574 0 : dst++;
4575 : }
4576 : }
4577 :
4578 : }
4579 :
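     : /* ADD between two a8 images.  After aligning dst to 4 bytes, the
     :  * bulk of each scanline is handed to sse2_combine_add_u as 32-bit
     :  * words; head and tail pixels use a branch-free scalar saturating
     :  * add.
     :  */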
4580 : static void
4581 0 : sse2_composite_add_8_8 (pixman_implementation_t *imp,
4582 : pixman_op_t op,
4583 : pixman_image_t * src_image,
4584 : pixman_image_t * mask_image,
4585 : pixman_image_t * dst_image,
4586 : int32_t src_x,
4587 : int32_t src_y,
4588 : int32_t mask_x,
4589 : int32_t mask_y,
4590 : int32_t dest_x,
4591 : int32_t dest_y,
4592 : int32_t width,
4593 : int32_t height)
4594 : {
4595 : uint8_t *dst_line, *dst;
4596 : uint8_t *src_line, *src;
4597 : int dst_stride, src_stride;
4598 : int32_t w;
4599 : uint16_t t;
4600 :
4601 0 : PIXMAN_IMAGE_GET_LINE (
4602 : src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4603 0 : PIXMAN_IMAGE_GET_LINE (
4604 : dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4605 :
4606 0 : while (height--)
4607 : {
4608 0 : dst = dst_line;
4609 0 : src = src_line;
4610 :
4611 0 : dst_line += dst_stride;
4612 0 : src_line += src_stride;
4613 0 : w = width;
4614 :
4615 : /* Small head */
4616 0 : while (w && (unsigned long)dst & 3)
4617 : {
4618 0 : t = (*dst) + (*src++);
4619 0 : *dst++ = t | (0 - (t >> 8));
4620 0 : w--;
4621 : }
4622 :
4623 0 : sse2_combine_add_u (imp, op,
4624 : (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4625 :
4626 : /* Small tail */
4627 0 :         dst += w & ~3;
4628 0 :         src += w & ~3;
4629 :
4630 0 : w &= 3;
4631 :
4632 0 : while (w)
4633 : {
4634 0 : t = (*dst) + (*src++);
4635 0 : *dst++ = t | (0 - (t >> 8));
4636 0 : w--;
4637 : }
4638 : }
4639 :
4640 0 : }
4641 :
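     : /* ADD between two 8888 images, one sse2_combine_add_u call per scanline. */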
4642 : static void
4643 0 : sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4644 : pixman_op_t op,
4645 : pixman_image_t * src_image,
4646 : pixman_image_t * mask_image,
4647 : pixman_image_t * dst_image,
4648 : int32_t src_x,
4649 : int32_t src_y,
4650 : int32_t mask_x,
4651 : int32_t mask_y,
4652 : int32_t dest_x,
4653 : int32_t dest_y,
4654 : int32_t width,
4655 : int32_t height)
4656 : {
4657 : uint32_t *dst_line, *dst;
4658 : uint32_t *src_line, *src;
4659 : int dst_stride, src_stride;
4660 :
4661 0 : PIXMAN_IMAGE_GET_LINE (
4662 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4663 0 : PIXMAN_IMAGE_GET_LINE (
4664 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4665 :
4666 0 : while (height--)
4667 : {
4668 0 : dst = dst_line;
4669 0 : dst_line += dst_stride;
4670 0 : src = src_line;
4671 0 : src_line += src_stride;
4672 :
4673 : sse2_combine_add_u (imp, op, dst, src, NULL, width);
4674 : }
4675 :
4676 0 : }
4677 :
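     : /* Plain copy between two images of equal depth (16 or 32 bpp).
     :  * Once the destination reaches 16-byte alignment, each row is
     :  * copied in 64-byte runs of unaligned loads and aligned stores,
     :  * with smaller loops handling the head and tail.
     :  */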
4678 : static pixman_bool_t
4679 27 : pixman_blt_sse2 (uint32_t *src_bits,
4680 : uint32_t *dst_bits,
4681 : int src_stride,
4682 : int dst_stride,
4683 : int src_bpp,
4684 : int dst_bpp,
4685 : int src_x,
4686 : int src_y,
4687 : int dst_x,
4688 : int dst_y,
4689 : int width,
4690 : int height)
4691 : {
4692 : uint8_t * src_bytes;
4693 : uint8_t * dst_bytes;
4694 : int byte_width;
4695 :
4696 27 : if (src_bpp != dst_bpp)
4697 0 : return FALSE;
4698 :
4699 27 : if (src_bpp == 16)
4700 : {
4701 0 : src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4702 0 : dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4703 0 :         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4704 0 : dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4705 0 : byte_width = 2 * width;
4706 0 : src_stride *= 2;
4707 0 : dst_stride *= 2;
4708 : }
4709 27 : else if (src_bpp == 32)
4710 : {
4711 27 : src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4712 27 : dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4713 27 : src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4714 27 : dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4715 27 : byte_width = 4 * width;
4716 27 : src_stride *= 4;
4717 27 : dst_stride *= 4;
4718 : }
4719 : else
4720 : {
4721 0 : return FALSE;
4722 : }
4723 :
4724 1047 : while (height--)
4725 : {
4726 : int w;
4727 993 : uint8_t *s = src_bytes;
4728 993 : uint8_t *d = dst_bytes;
4729 993 : src_bytes += src_stride;
4730 993 : dst_bytes += dst_stride;
4731 993 : w = byte_width;
4732 :
4733 1986 : while (w >= 2 && ((unsigned long)d & 3))
4734 : {
4735 0 : *(uint16_t *)d = *(uint16_t *)s;
4736 0 : w -= 2;
4737 0 : s += 2;
4738 0 : d += 2;
4739 : }
4740 :
4741 2382 : while (w >= 4 && ((unsigned long)d & 15))
4742 : {
4743 396 : *(uint32_t *)d = *(uint32_t *)s;
4744 :
4745 396 : w -= 4;
4746 396 : s += 4;
4747 396 : d += 4;
4748 : }
4749 :
4750 4576 : while (w >= 64)
4751 : {
4752 : __m128i xmm0, xmm1, xmm2, xmm3;
4753 :
4754 5180 : xmm0 = load_128_unaligned ((__m128i*)(s));
4755 5180 : xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4756 5180 : xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4757 5180 : xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4758 :
4759 2590 : save_128_aligned ((__m128i*)(d), xmm0);
4760 2590 : save_128_aligned ((__m128i*)(d + 16), xmm1);
4761 2590 : save_128_aligned ((__m128i*)(d + 32), xmm2);
4762 2590 : save_128_aligned ((__m128i*)(d + 48), xmm3);
4763 :
4764 2590 : s += 64;
4765 2590 : d += 64;
4766 2590 : w -= 64;
4767 : }
4768 :
4769 2190 : while (w >= 16)
4770 : {
4771 408 :             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
4772 :
4773 204 : w -= 16;
4774 204 : d += 16;
4775 204 : s += 16;
4776 : }
4777 :
4778 2388 : while (w >= 4)
4779 : {
4780 402 : *(uint32_t *)d = *(uint32_t *)s;
4781 :
4782 402 : w -= 4;
4783 402 : s += 4;
4784 402 : d += 4;
4785 : }
4786 :
4787 993 : if (w >= 2)
4788 : {
4789 0 : *(uint16_t *)d = *(uint16_t *)s;
4790 0 : w -= 2;
4791 0 : s += 2;
4792 0 : d += 2;
4793 : }
4794 : }
4795 :
4797 27 : return TRUE;
4798 : }
4799 :
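     : /* SRC copy of a rectangle.  The fast path table only selects this
     :  * for source and destination formats of equal bpp, so it is just
     :  * a blt.
     :  */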
4800 : static void
4801 27 : sse2_composite_copy_area (pixman_implementation_t *imp,
4802 : pixman_op_t op,
4803 : pixman_image_t * src_image,
4804 : pixman_image_t * mask_image,
4805 : pixman_image_t * dst_image,
4806 : int32_t src_x,
4807 : int32_t src_y,
4808 : int32_t mask_x,
4809 : int32_t mask_y,
4810 : int32_t dest_x,
4811 : int32_t dest_y,
4812 : int32_t width,
4813 : int32_t height)
4814 : {
4815 54 : pixman_blt_sse2 (src_image->bits.bits,
4816 : dst_image->bits.bits,
4817 : src_image->bits.rowstride,
4818 : dst_image->bits.rowstride,
4819 27 : PIXMAN_FORMAT_BPP (src_image->bits.format),
4820 27 : PIXMAN_FORMAT_BPP (dst_image->bits.format),
4821 : src_x, src_y, dest_x, dest_y, width, height);
4822 27 : }
4823 :
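     : /* OVER with an x888 source (alpha forced to 0xff) and an a8 mask.
     :  * A fully opaque group of four mask bytes lets the source pixels
     :  * be stored directly; otherwise they go through in_over.
     :  */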
4824 : static void
4825 0 : sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4826 : pixman_op_t op,
4827 : pixman_image_t * src_image,
4828 : pixman_image_t * mask_image,
4829 : pixman_image_t * dst_image,
4830 : int32_t src_x,
4831 : int32_t src_y,
4832 : int32_t mask_x,
4833 : int32_t mask_y,
4834 : int32_t dest_x,
4835 : int32_t dest_y,
4836 : int32_t width,
4837 : int32_t height)
4838 : {
4839 : uint32_t *src, *src_line, s;
4840 : uint32_t *dst, *dst_line, d;
4841 : uint8_t *mask, *mask_line;
4842 : uint32_t m;
4843 : int src_stride, mask_stride, dst_stride;
4844 : int32_t w;
4845 : __m128i ms;
4846 :
4847 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4848 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4849 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850 :
4851 0 : PIXMAN_IMAGE_GET_LINE (
4852 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4853 0 : PIXMAN_IMAGE_GET_LINE (
4854 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4855 0 : PIXMAN_IMAGE_GET_LINE (
4856 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4857 :
4858 0 : while (height--)
4859 : {
4860 0 : src = src_line;
4861 0 : src_line += src_stride;
4862 0 : dst = dst_line;
4863 0 : dst_line += dst_stride;
4864 0 : mask = mask_line;
4865 0 : mask_line += mask_stride;
4866 :
4867 0 : w = width;
4868 :
4869 0 : while (w && (unsigned long)dst & 15)
4870 : {
4871 0 : s = 0xff000000 | *src++;
4872 0 : m = (uint32_t) *mask++;
4873 0 : d = *dst;
4874 0 : ms = unpack_32_1x128 (s);
4875 :
4876 0 : if (m != 0xff)
4877 : {
4878 0 : __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4879 0 : __m128i md = unpack_32_1x128 (d);
4880 :
4881 0 : ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4882 : }
4883 :
4884 0 : *dst++ = pack_1x128_32 (ms);
4885 0 : w--;
4886 : }
4887 :
4888 0 : while (w >= 4)
4889 : {
4890 0 : m = *(uint32_t*) mask;
4891 0 : xmm_src = _mm_or_si128 (
4892 : load_128_unaligned ((__m128i*)src), mask_ff000000);
4893 :
4894 0 : if (m == 0xffffffff)
4895 : {
4896 0 : save_128_aligned ((__m128i*)dst, xmm_src);
4897 : }
4898 : else
4899 : {
4900 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
4901 :
4902 0 : xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4903 :
4904 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4905 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4906 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4907 :
4908 0 : expand_alpha_rev_2x128 (
4909 : xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4910 :
4911 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4912 : &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4913 : &xmm_dst_lo, &xmm_dst_hi);
4914 :
4915 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4916 : }
4917 :
4918 0 : src += 4;
4919 0 : dst += 4;
4920 0 : mask += 4;
4921 0 : w -= 4;
4922 : }
4923 :
4924 0 : while (w)
4925 : {
4926 0 : m = (uint32_t) *mask++;
4927 :
4928 0 : if (m)
4929 : {
4930 0 : s = 0xff000000 | *src;
4931 :
4932 0 : if (m == 0xff)
4933 : {
4934 0 : *dst = s;
4935 : }
4936 : else
4937 : {
4938 : __m128i ma, md, ms;
4939 :
4940 0 : d = *dst;
4941 :
4942 0 : ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4943 0 : md = unpack_32_1x128 (d);
4944 0 : ms = unpack_32_1x128 (s);
4945 :
4946 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4947 : }
4948 :
4949 : }
4950 :
4951 0 : src++;
4952 0 : dst++;
4953 0 : w--;
4954 : }
4955 : }
4956 :
4957 0 : }
4958 :
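     : /* OVER with an 8888 source and an a8 mask.  Zero mask bytes are
     :  * skipped, an opaque mask over an opaque source is stored
     :  * directly, and the general case goes through in_over.
     :  */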
4959 : static void
4960 0 : sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4961 : pixman_op_t op,
4962 : pixman_image_t * src_image,
4963 : pixman_image_t * mask_image,
4964 : pixman_image_t * dst_image,
4965 : int32_t src_x,
4966 : int32_t src_y,
4967 : int32_t mask_x,
4968 : int32_t mask_y,
4969 : int32_t dest_x,
4970 : int32_t dest_y,
4971 : int32_t width,
4972 : int32_t height)
4973 : {
4974 : uint32_t *src, *src_line, s;
4975 : uint32_t *dst, *dst_line, d;
4976 : uint8_t *mask, *mask_line;
4977 : uint32_t m;
4978 : int src_stride, mask_stride, dst_stride;
4979 : int32_t w;
4980 :
4981 : __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4982 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4983 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4984 :
4985 0 : PIXMAN_IMAGE_GET_LINE (
4986 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4987 0 : PIXMAN_IMAGE_GET_LINE (
4988 : mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4989 0 : PIXMAN_IMAGE_GET_LINE (
4990 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4991 :
4992 0 : while (height--)
4993 : {
4994 0 : src = src_line;
4995 0 : src_line += src_stride;
4996 0 : dst = dst_line;
4997 0 : dst_line += dst_stride;
4998 0 : mask = mask_line;
4999 0 : mask_line += mask_stride;
5000 :
5001 0 : w = width;
5002 :
5003 0 : while (w && (unsigned long)dst & 15)
5004 : {
5005 : uint32_t sa;
5006 :
5007 0 : s = *src++;
5008 0 : m = (uint32_t) *mask++;
5009 0 : d = *dst;
5010 :
5011 0 : sa = s >> 24;
5012 :
5013 0 : if (m)
5014 : {
5015 0 : if (sa == 0xff && m == 0xff)
5016 : {
5017 0 : *dst = s;
5018 : }
5019 : else
5020 : {
5021 : __m128i ms, md, ma, msa;
5022 :
5023 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5024 0 : ms = unpack_32_1x128 (s);
5025 0 : md = unpack_32_1x128 (d);
5026 :
5027 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5028 :
5029 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5030 : }
5031 : }
5032 :
5033 0 : dst++;
5034 0 : w--;
5035 : }
5036 :
5037 0 : while (w >= 4)
5038 : {
5039 0 : m = *(uint32_t *) mask;
5040 :
5041 0 : if (m)
5042 : {
5043 0 : xmm_src = load_128_unaligned ((__m128i*)src);
5044 :
5045 0 : if (m == 0xffffffff && is_opaque (xmm_src))
5046 : {
5047 0 : save_128_aligned ((__m128i *)dst, xmm_src);
5048 : }
5049 : else
5050 : {
5051 0 : xmm_dst = load_128_aligned ((__m128i *)dst);
5052 :
5053 0 : xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5054 :
5055 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5056 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5057 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5058 :
5059 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5060 0 : expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5061 :
5062 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5063 : &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5064 :
5065 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5066 : }
5067 : }
5068 :
5069 0 : src += 4;
5070 0 : dst += 4;
5071 0 : mask += 4;
5072 0 : w -= 4;
5073 : }
5074 :
5075 0 : while (w)
5076 : {
5077 : uint32_t sa;
5078 :
5079 0 : s = *src++;
5080 0 : m = (uint32_t) *mask++;
5081 0 : d = *dst;
5082 :
5083 0 : sa = s >> 24;
5084 :
5085 0 : if (m)
5086 : {
5087 0 : if (sa == 0xff && m == 0xff)
5088 : {
5089 0 : *dst = s;
5090 : }
5091 : else
5092 : {
5093 : __m128i ms, md, ma, msa;
5094 :
5095 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5096 0 : ms = unpack_32_1x128 (s);
5097 0 : md = unpack_32_1x128 (d);
5098 :
5099 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5100 :
5101 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5102 : }
5103 : }
5104 :
5105 0 : dst++;
5106 0 : w--;
5107 : }
5108 : }
5109 :
5110 0 : }
5111 :
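     : /* OVER_REVERSE with a solid source: computes dest OVER source,
     :  * i.e. the existing destination is composited on top of the
     :  * expanded solid color.
     :  */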
5112 : static void
5113 0 : sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5114 : pixman_op_t op,
5115 : pixman_image_t * src_image,
5116 : pixman_image_t * mask_image,
5117 : pixman_image_t * dst_image,
5118 : int32_t src_x,
5119 : int32_t src_y,
5120 : int32_t mask_x,
5121 : int32_t mask_y,
5122 : int32_t dest_x,
5123 : int32_t dest_y,
5124 : int32_t width,
5125 : int32_t height)
5126 : {
5127 : uint32_t src;
5128 : uint32_t *dst_line, *dst;
5129 : __m128i xmm_src;
5130 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5131 : __m128i xmm_dsta_hi, xmm_dsta_lo;
5132 : int dst_stride;
5133 : int32_t w;
5134 :
5135 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5136 :
5137 0 : if (src == 0)
5138 0 : return;
5139 :
5140 0 : PIXMAN_IMAGE_GET_LINE (
5141 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5142 :
5143 0 : xmm_src = expand_pixel_32_1x128 (src);
5144 :
5145 0 : while (height--)
5146 : {
5147 0 : dst = dst_line;
5148 :
5149 0 : dst_line += dst_stride;
5150 0 : w = width;
5151 :
5152 0 : while (w && (unsigned long)dst & 15)
5153 : {
5154 : __m128i vd;
5155 :
5156 0 : vd = unpack_32_1x128 (*dst);
5157 :
5158 0 : *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5159 : xmm_src));
5160 0 : w--;
5161 0 : dst++;
5162 : }
5163 :
5164 0 : while (w >= 4)
5165 : {
5166 : __m128i tmp_lo, tmp_hi;
5167 :
5168 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
5169 :
5170 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5171 0 : expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5172 :
5173 0 : tmp_lo = xmm_src;
5174 0 : tmp_hi = xmm_src;
5175 :
5176 : over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5177 : &xmm_dsta_lo, &xmm_dsta_hi,
5178 : &tmp_lo, &tmp_hi);
5179 :
5180 0 : save_128_aligned (
5181 : (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5182 :
5183 0 : w -= 4;
5184 0 : dst += 4;
5185 : }
5186 :
5187 0 : while (w)
5188 : {
5189 : __m128i vd;
5190 :
5191 0 : vd = unpack_32_1x128 (*dst);
5192 :
5193 0 : *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5194 : xmm_src));
5195 0 : w--;
5196 0 : dst++;
5197 : }
5198 :
5199 : }
5200 :
5201 : }
5202 :
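     : /* OVER with an 8888 source and an 8888 mask; only the alpha
     :  * channel of the mask is used (m = *mask >> 24).
     :  */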
5203 : static void
5204 0 : sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5205 : pixman_op_t op,
5206 : pixman_image_t * src_image,
5207 : pixman_image_t * mask_image,
5208 : pixman_image_t * dst_image,
5209 : int32_t src_x,
5210 : int32_t src_y,
5211 : int32_t mask_x,
5212 : int32_t mask_y,
5213 : int32_t dest_x,
5214 : int32_t dest_y,
5215 : int32_t width,
5216 : int32_t height)
5217 : {
5218 : uint32_t *src, *src_line, s;
5219 : uint32_t *dst, *dst_line, d;
5220 : uint32_t *mask, *mask_line;
5221 : uint32_t m;
5222 : int src_stride, mask_stride, dst_stride;
5223 : int32_t w;
5224 :
5225 : __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5226 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5227 : __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5228 :
5229 0 : PIXMAN_IMAGE_GET_LINE (
5230 : dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5231 0 : PIXMAN_IMAGE_GET_LINE (
5232 : mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5233 0 : PIXMAN_IMAGE_GET_LINE (
5234 : src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5235 :
5236 0 : while (height--)
5237 : {
5238 0 : src = src_line;
5239 0 : src_line += src_stride;
5240 0 : dst = dst_line;
5241 0 : dst_line += dst_stride;
5242 0 : mask = mask_line;
5243 0 : mask_line += mask_stride;
5244 :
5245 0 : w = width;
5246 :
5247 0 : while (w && (unsigned long)dst & 15)
5248 : {
5249 : uint32_t sa;
5250 :
5251 0 : s = *src++;
5252 0 : m = (*mask++) >> 24;
5253 0 : d = *dst;
5254 :
5255 0 : sa = s >> 24;
5256 :
5257 0 : if (m)
5258 : {
5259 0 : if (sa == 0xff && m == 0xff)
5260 : {
5261 0 : *dst = s;
5262 : }
5263 : else
5264 : {
5265 : __m128i ms, md, ma, msa;
5266 :
5267 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5268 0 : ms = unpack_32_1x128 (s);
5269 0 : md = unpack_32_1x128 (d);
5270 :
5271 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5272 :
5273 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5274 : }
5275 : }
5276 :
5277 0 : dst++;
5278 0 : w--;
5279 : }
5280 :
5281 0 : while (w >= 4)
5282 : {
5283 0 : xmm_mask = load_128_unaligned ((__m128i*)mask);
5284 :
5285 0 : if (!is_transparent (xmm_mask))
5286 : {
5287 0 : xmm_src = load_128_unaligned ((__m128i*)src);
5288 :
5289 0 : if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5290 : {
5291 0 : save_128_aligned ((__m128i *)dst, xmm_src);
5292 : }
5293 : else
5294 : {
5295 0 : xmm_dst = load_128_aligned ((__m128i *)dst);
5296 :
5297 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5298 0 : unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5299 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5300 :
5301 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5302 0 : expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5303 :
5304 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5305 : &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5306 :
5307 0 : save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5308 : }
5309 : }
5310 :
5311 0 : src += 4;
5312 0 : dst += 4;
5313 0 : mask += 4;
5314 0 : w -= 4;
5315 : }
5316 :
5317 0 : while (w)
5318 : {
5319 : uint32_t sa;
5320 :
5321 0 : s = *src++;
5322 0 : m = (*mask++) >> 24;
5323 0 : d = *dst;
5324 :
5325 0 : sa = s >> 24;
5326 :
5327 0 : if (m)
5328 : {
5329 0 : if (sa == 0xff && m == 0xff)
5330 : {
5331 0 : *dst = s;
5332 : }
5333 : else
5334 : {
5335 : __m128i ms, md, ma, msa;
5336 :
5337 0 : ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5338 0 : ms = unpack_32_1x128 (s);
5339 0 : md = unpack_32_1x128 (d);
5340 :
5341 0 : msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5342 :
5343 0 : *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5344 : }
5345 : }
5346 :
5347 0 : dst++;
5348 0 : w--;
5349 : }
5350 : }
5351 :
5352 0 : }
5353 :
5354 : /* A variant of 'sse2_combine_over_u' with minor tweaks */
5355 : static force_inline void
5356 : scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5357 : const uint32_t* ps,
5358 : int32_t w,
5359 : pixman_fixed_t vx,
5360 : pixman_fixed_t unit_x,
5361 : pixman_fixed_t max_vx,
5362 : pixman_bool_t fully_transparent_src)
5363 : {
5364 : uint32_t s, d;
5365 0 : const uint32_t* pm = NULL;
5366 :
5367 : __m128i xmm_dst_lo, xmm_dst_hi;
5368 : __m128i xmm_src_lo, xmm_src_hi;
5369 : __m128i xmm_alpha_lo, xmm_alpha_hi;
5370 :
5371 0 : if (fully_transparent_src)
5372 : return;
5373 :
5374 : /* Align dst on a 16-byte boundary */
5375 0 : while (w && ((unsigned long)pd & 15))
5376 : {
5377 0 : d = *pd;
5378 0 : s = combine1 (ps + (vx >> 16), pm);
5379 0 : vx += unit_x;
5380 :
5381 0 : *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5382 0 : if (pm)
5383 0 : pm++;
5384 0 : w--;
5385 : }
5386 :
5387 0 : while (w >= 4)
5388 : {
5389 : __m128i tmp;
5390 : uint32_t tmp1, tmp2, tmp3, tmp4;
5391 :
5392 0 : tmp1 = ps[vx >> 16];
5393 0 : vx += unit_x;
5394 0 : tmp2 = ps[vx >> 16];
5395 0 : vx += unit_x;
5396 0 : tmp3 = ps[vx >> 16];
5397 0 : vx += unit_x;
5398 0 : tmp4 = ps[vx >> 16];
5399 0 : vx += unit_x;
5400 :
5401 0 : tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5402 :
5403 0 : xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5404 :
5405 0 : if (is_opaque (xmm_src_hi))
5406 : {
5407 0 : save_128_aligned ((__m128i*)pd, xmm_src_hi);
5408 : }
5409 0 : else if (!is_zero (xmm_src_hi))
5410 : {
5411 0 : xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5412 :
5413 0 : unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5414 0 : unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5415 :
5416 0 : expand_alpha_2x128 (
5417 : xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5418 :
5419 : over_2x128 (&xmm_src_lo, &xmm_src_hi,
5420 : &xmm_alpha_lo, &xmm_alpha_hi,
5421 : &xmm_dst_lo, &xmm_dst_hi);
5422 :
5423 :             /* rebuild the 4 pixel data and save */
5424 0 : save_128_aligned ((__m128i*)pd,
5425 : pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5426 : }
5427 :
5428 0 : w -= 4;
5429 0 : pd += 4;
5430 0 : if (pm)
5431 0 : pm += 4;
5432 : }
5433 :
5434 0 : while (w)
5435 : {
5436 0 : d = *pd;
5437 0 : s = combine1 (ps + (vx >> 16), pm);
5438 0 : vx += unit_x;
5439 :
5440 0 : *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5441 0 : if (pm)
5442 0 : pm++;
5443 :
5444 0 : w--;
5445 : }
5446 : }
5447 :
5448 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5449 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5450 0 : uint32_t, uint32_t, COVER)
5451 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5452 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5453 0 : uint32_t, uint32_t, NONE)
5454 0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5455 : scaled_nearest_scanline_sse2_8888_8888_OVER,
5456 0 : uint32_t, uint32_t, PAD)
5457 :
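     : /* Nearest-neighbour scaled OVER with a solid mask: the mask alpha
     :  * is expanded once into xmm_mask and applied to every fetched
     :  * source pixel via in_over.
     :  */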
5458 : static force_inline void
5459 : scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5460 : uint32_t * dst,
5461 : const uint32_t * src,
5462 : int32_t w,
5463 : pixman_fixed_t vx,
5464 : pixman_fixed_t unit_x,
5465 : pixman_fixed_t max_vx,
5466 : pixman_bool_t zero_src)
5467 : {
5468 : __m128i xmm_mask;
5469 : __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5470 : __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5471 : __m128i xmm_alpha_lo, xmm_alpha_hi;
5472 :
5473 0 : if (zero_src || (*mask >> 24) == 0)
5474 : return;
5475 :
5476 0 : xmm_mask = create_mask_16_128 (*mask >> 24);
5477 :
5478 0 : while (w && (unsigned long)dst & 15)
5479 : {
5480 0 : uint32_t s = src[pixman_fixed_to_int (vx)];
5481 0 : vx += unit_x;
5482 :
5483 0 : if (s)
5484 : {
5485 0 : uint32_t d = *dst;
5486 :
5487 0 : __m128i ms = unpack_32_1x128 (s);
5488 0 : __m128i alpha = expand_alpha_1x128 (ms);
5489 0 :             __m128i mask = xmm_mask;
5490 0 :             __m128i dest = unpack_32_1x128 (d);
5491 :
5492 0 :             *dst = pack_1x128_32 (
5493 :                 in_over_1x128 (&ms, &alpha, &mask, &dest));
5494 : }
5495 0 : dst++;
5496 0 : w--;
5497 : }
5498 :
5499 0 : while (w >= 4)
5500 : {
5501 : uint32_t tmp1, tmp2, tmp3, tmp4;
5502 :
5503 0 : tmp1 = src[pixman_fixed_to_int (vx)];
5504 0 : vx += unit_x;
5505 0 : tmp2 = src[pixman_fixed_to_int (vx)];
5506 0 : vx += unit_x;
5507 0 : tmp3 = src[pixman_fixed_to_int (vx)];
5508 0 : vx += unit_x;
5509 0 : tmp4 = src[pixman_fixed_to_int (vx)];
5510 0 : vx += unit_x;
5511 :
5512 0 : xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5513 :
5514 0 : if (!is_zero (xmm_src))
5515 : {
5516 0 : xmm_dst = load_128_aligned ((__m128i*)dst);
5517 :
5518 0 : unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5519 0 : unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5520 0 : expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5521 : &xmm_alpha_lo, &xmm_alpha_hi);
5522 :
5523 : in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5524 : &xmm_alpha_lo, &xmm_alpha_hi,
5525 : &xmm_mask, &xmm_mask,
5526 : &xmm_dst_lo, &xmm_dst_hi);
5527 :
5528 0 : save_128_aligned (
5529 : (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5530 : }
5531 :
5532 0 : dst += 4;
5533 0 : w -= 4;
5534 : }
5535 :
5536 0 : while (w)
5537 : {
5538 0 : uint32_t s = src[pixman_fixed_to_int (vx)];
5539 0 : vx += unit_x;
5540 :
5541 0 : if (s)
5542 : {
5543 0 : uint32_t d = *dst;
5544 :
5545 0 : __m128i ms = unpack_32_1x128 (s);
5546 0 : __m128i alpha = expand_alpha_1x128 (ms);
5547 0 : __m128i mask = xmm_mask;
5548 0 : __m128i dest = unpack_32_1x128 (d);
5549 :
5550 0 : *dst = pack_1x128_32 (
5551 : in_over_1x128 (&ms, &alpha, &mask, &dest));
5552 : }
5553 :
5554 0 : dst++;
5555 0 : w--;
5556 : }
5557 :
5558 : }
5559 :
5560 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5561 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5562 0 : uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5563 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5564 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5565 0 : uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5566 0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5567 : scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5568 0 : uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5569 :
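     : /* Emit one scanline of bilinearly interpolated pixels.  wt and wb
     :  * are the vertical weights for the top and bottom source rows;
     :  * the horizontal weights are derived from the fractional bits of
     :  * the stepped coordinate x.
     :  */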
5570 : static void
5571 304 : bilinear_interpolate_line_sse2 (uint32_t * out,
5572 : const uint32_t * top,
5573 : const uint32_t * bottom,
5574 : int wt,
5575 : int wb,
5576 : pixman_fixed_t x,
5577 : pixman_fixed_t ux,
5578 : int width)
5579 : {
5580 608 : const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
5581 608 : const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
5582 304 : const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
5583 304 : const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
5584 608 : const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
5585 304 : const __m128i xmm_zero = _mm_setzero_si128 ();
5586 608 : __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
5587 : uint32_t pix1, pix2, pix3, pix4;
5588 :
5589 : #define INTERPOLATE_ONE_PIXEL(pix) \
5590 : do { \
5591 : __m128i xmm_wh, xmm_lo, xmm_hi, a; \
5592 : /* fetch 2x2 pixel block into sse2 register */ \
5593 : uint32_t tl = top [pixman_fixed_to_int (x)]; \
5594 : uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \
5595 : uint32_t bl = bottom [pixman_fixed_to_int (x)]; \
5596 : uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \
5597 : a = _mm_set_epi32 (tr, tl, br, bl); \
5598 : x += ux; \
5599 : /* vertical interpolation */ \
5600 : a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
5601 : xmm_wt), \
5602 : _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
5603 : xmm_wb)); \
5604 : /* calculate horizontal weights */ \
5605 : xmm_wh = _mm_add_epi16 (xmm_addc, \
5606 : _mm_xor_si128 (xmm_xorc, \
5607 : _mm_srli_epi16 (xmm_x, 8))); \
5608 : xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5609 : /* horizontal interpolation */ \
5610 : xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
5611 : xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
5612 : a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
5613 : _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
5614 : /* shift and pack the result */ \
5615 : a = _mm_srli_epi32 (a, 16); \
5616 : a = _mm_packs_epi32 (a, a); \
5617 : a = _mm_packus_epi16 (a, a); \
5618 : pix = _mm_cvtsi128_si32 (a); \
5619 : } while (0)
5620 :
5621 1360 : while ((width -= 4) >= 0)
5622 : {
5623 12032 : INTERPOLATE_ONE_PIXEL (pix1);
5624 12032 : INTERPOLATE_ONE_PIXEL (pix2);
5625 12032 : INTERPOLATE_ONE_PIXEL (pix3);
5626 12032 : INTERPOLATE_ONE_PIXEL (pix4);
5627 752 : *out++ = pix1;
5628 752 : *out++ = pix2;
5629 752 : *out++ = pix3;
5630 752 : *out++ = pix4;
5631 : }
5632 304 : if (width & 2)
5633 : {
5634 1792 : INTERPOLATE_ONE_PIXEL (pix1);
5635 1792 : INTERPOLATE_ONE_PIXEL (pix2);
5636 112 : *out++ = pix1;
5637 112 : *out++ = pix2;
5638 : }
5639 304 : if (width & 1)
5640 : {
5641 1536 : INTERPOLATE_ONE_PIXEL (pix1);
5642 96 : *out = pix1;
5643 : }
5644 :
5645 : #undef INTERPOLATE_ONE_PIXEL
5646 304 : }
5647 :
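     : /* SRC with bilinear filtering is a pure interpolation: just fill
     :  * the destination scanline from the two source rows.
     :  */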
5648 : static force_inline void
5649 : scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5650 : const uint32_t * mask,
5651 : const uint32_t * src_top,
5652 : const uint32_t * src_bottom,
5653 : int32_t w,
5654 : int wt,
5655 : int wb,
5656 : pixman_fixed_t vx,
5657 : pixman_fixed_t unit_x,
5658 : pixman_fixed_t max_vx,
5659 : pixman_bool_t zero_src)
5660 : {
5661 304 : bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
5662 : wt, wb, vx, unit_x, w);
5663 : }
5664 :
5665 6 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5666 : scaled_bilinear_scanline_sse2_8888_8888_SRC,
5667 : uint32_t, uint32_t, uint32_t,
5668 114 : COVER, FALSE, FALSE)
5669 0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5670 : scaled_bilinear_scanline_sse2_8888_8888_SRC,
5671 : uint32_t, uint32_t, uint32_t,
5672 0 : PAD, FALSE, FALSE)
5673 300 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5674 : scaled_bilinear_scanline_sse2_8888_8888_SRC,
5675 : uint32_t, uint32_t, uint32_t,
5676 4 : NONE, FALSE, FALSE)
5677 :
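     : /* The fast path table: entries are matched first to last against
     :  * the composite operation and the source, mask and destination
     :  * formats.
     :  */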
5678 : static const pixman_fast_path_t sse2_fast_paths[] =
5679 : {
5680 : /* PIXMAN_OP_OVER */
5681 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5682 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5683 : PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5684 : PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5685 : PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5686 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5687 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5688 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5689 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5690 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5691 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5692 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5693 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5694 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5695 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5696 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5697 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5698 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5699 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5700 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5701 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5702 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5703 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5704 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5705 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5706 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5707 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5708 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5709 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5710 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5711 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5712 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5713 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5714 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5715 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5716 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5717 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5718 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5719 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5720 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5721 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5722 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5723 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5724 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5725 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5726 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5727 :
5728 : /* PIXMAN_OP_OVER_REVERSE */
5729 : PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5730 : PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5731 :
5732 : /* PIXMAN_OP_ADD */
5733 : PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5734 : PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5735 : PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5736 : PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5737 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5738 : PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5739 :
5740 : /* PIXMAN_OP_SRC */
5741 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5742 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5743 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5744 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5745 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5746 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5747 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5748 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5749 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5750 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5751 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5752 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5753 : PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5754 : PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5755 :
5756 : /* PIXMAN_OP_IN */
5757 : PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5758 : PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5759 : PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5760 :
5761 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5762 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5763 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5764 : SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5765 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5766 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5767 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5768 : SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5769 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5770 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5771 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5772 : SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5773 :
5774 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5775 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5776 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5777 : SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5778 :
5779 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5780 : SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5781 : SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
5782 :
5783 : { PIXMAN_OP_NONE },
5784 : };
5785 :
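     : /* Implementation hook for blt; falls back to the delegate when
     :  * pixman_blt_sse2 cannot handle the bpp combination.
     :  */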
5786 : static pixman_bool_t
5787 0 : sse2_blt (pixman_implementation_t *imp,
5788 : uint32_t * src_bits,
5789 : uint32_t * dst_bits,
5790 : int src_stride,
5791 : int dst_stride,
5792 : int src_bpp,
5793 : int dst_bpp,
5794 : int src_x,
5795 : int src_y,
5796 : int dst_x,
5797 : int dst_y,
5798 : int width,
5799 : int height)
5800 : {
5801 0 : if (!pixman_blt_sse2 (
5802 : src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5803 : src_x, src_y, dst_x, dst_y, width, height))
5805 : {
5806 0 : return _pixman_implementation_blt (
5807 : imp->delegate,
5808 : src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5809 : src_x, src_y, dst_x, dst_y, width, height);
5810 : }
5811 :
5812 0 : return TRUE;
5813 : }
5814 :
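     : /* Implementation hook for fill; falls back to the delegate when
     :  * pixman_fill_sse2 cannot handle the request.
     :  */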
5815 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5816 : __attribute__((__force_align_arg_pointer__))
5817 : #endif
5818 : static pixman_bool_t
5819 17 : sse2_fill (pixman_implementation_t *imp,
5820 : uint32_t * bits,
5821 : int stride,
5822 : int bpp,
5823 : int x,
5824 : int y,
5825 : int width,
5826 : int height,
5827 : uint32_t xor)
5828 : {
5829 17 : if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5830 : {
5831 0 : return _pixman_implementation_fill (
5832 : imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5833 : }
5834 :
5835 17 : return TRUE;
5836 : }
5837 :
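     : /* Scanline fetchers: each converts one row of the source image to
     :  * a8r8g8b8 in iter->buffer.  For x8r8g8b8 that is just ORing in
     :  * an opaque alpha byte.
     :  */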
5838 : static uint32_t *
5839 0 : sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5840 : {
5841 0 : int w = iter->width;
5842 0 : __m128i ff000000 = mask_ff000000;
5843 0 : uint32_t *dst = iter->buffer;
5844 0 : uint32_t *src = (uint32_t *)iter->bits;
5845 :
5846 0 : iter->bits += iter->stride;
5847 :
5848 0 : while (w && ((unsigned long)dst) & 0x0f)
5849 : {
5850 0 : *dst++ = (*src++) | 0xff000000;
5851 0 : w--;
5852 : }
5853 :
5854 0 : while (w >= 4)
5855 : {
5856 0 : save_128_aligned (
5857 : (__m128i *)dst, _mm_or_si128 (
5858 : load_128_unaligned ((__m128i *)src), ff000000));
5859 :
5860 0 : dst += 4;
5861 0 : src += 4;
5862 0 : w -= 4;
5863 : }
5864 :
5865 0 : while (w)
5866 : {
5867 0 : *dst++ = (*src++) | 0xff000000;
5868 0 : w--;
5869 : }
5870 :
5871 0 : return iter->buffer;
5872 : }
5873 :
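     : /* r5g6b5 fetcher: widen eight pixels at a time with
     :  * unpack_565_to_8888 and force the alpha channel opaque.
     :  */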
5874 : static uint32_t *
5875 0 : sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5876 : {
5877 0 : int w = iter->width;
5878 0 : uint32_t *dst = iter->buffer;
5879 0 : uint16_t *src = (uint16_t *)iter->bits;
5880 0 : __m128i ff000000 = mask_ff000000;
5881 :
5882 0 : iter->bits += iter->stride;
5883 :
5884 0 : while (w && ((unsigned long)dst) & 0x0f)
5885 : {
5886 0 : uint16_t s = *src++;
5887 :
5888 0 : *dst++ = CONVERT_0565_TO_8888 (s);
5889 0 : w--;
5890 : }
5891 :
5892 0 : while (w >= 8)
5893 : {
5894 : __m128i lo, hi, s;
5895 :
5896 0 : s = _mm_loadu_si128 ((__m128i *)src);
5897 :
5898 0 : lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5899 0 : hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5900 :
5901 0 : save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5902 0 : save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5903 :
5904 0 : dst += 8;
5905 0 : src += 8;
5906 0 : w -= 8;
5907 : }
5908 :
5909 0 : while (w)
5910 : {
5911 0 : uint16_t s = *src++;
5912 :
5913 0 : *dst++ = CONVERT_0565_TO_8888 (s);
5914 0 : w--;
5915 : }
5916 :
5917 0 : return iter->buffer;
5918 : }
5919 :
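     : /* a8 fetcher: the unpack sequence moves each source byte into the
     :  * alpha position of a zeroed 32-bit pixel, i.e. dst = src << 24.
     :  */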
5920 : static uint32_t *
5921 0 : sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5922 : {
5923 0 : int w = iter->width;
5924 0 : uint32_t *dst = iter->buffer;
5925 0 : uint8_t *src = iter->bits;
5926 : __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5927 :
5928 0 : iter->bits += iter->stride;
5929 :
5930 0 : while (w && (((unsigned long)dst) & 15))
5931 : {
5932 0 : *dst++ = *(src++) << 24;
5933 0 : w--;
5934 : }
5935 :
5936 0 : while (w >= 16)
5937 : {
5938 0 : xmm0 = _mm_loadu_si128((__m128i *)src);
5939 :
5940 0 : xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
5941 0 : xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
5942 0 : xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5943 0 : xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5944 0 : xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5945 0 : xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5946 :
5947 0 : _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
5948 0 : _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
5949 0 : _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
5950 0 : _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5951 :
5952 0 : dst += 16;
5953 0 : src += 16;
5954 0 : w -= 16;
5955 : }
5956 :
5957 0 : while (w)
5958 : {
5959 0 : *dst++ = *(src++) << 24;
5960 0 : w--;
5961 : }
5962 :
5963 0 : return iter->buffer;
5964 : }
5965 :
5966 : typedef struct
5967 : {
5968 : pixman_format_code_t format;
5969 : pixman_iter_get_scanline_t get_scanline;
5970 : } fetcher_info_t;
5971 :
5972 : static const fetcher_info_t fetchers[] =
5973 : {
5974 : { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
5975 : { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
5976 : { PIXMAN_a8, sse2_fetch_a8 },
5977 : { PIXMAN_null }
5978 : };
5979 :
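     : /* Use one of the SSE2 fetchers when iterating over an
     :  * untransformed image that fully contains the requested
     :  * rectangle; otherwise delegate.
     :  */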
5980 : static void
5981 0 : sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
5982 : {
5983 0 : pixman_image_t *image = iter->image;
5984 0 : int x = iter->x;
5985 0 : int y = iter->y;
5986 0 : int width = iter->width;
5987 0 : int height = iter->height;
5988 :
5989 : #define FLAGS \
5990 : (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
5991 :
5992 0 : if ((iter->flags & ITER_NARROW) &&
5993 0 : (image->common.flags & FLAGS) == FLAGS &&
5994 0 : x >= 0 && y >= 0 &&
5995 0 : x + width <= image->bits.width &&
5996 0 : y + height <= image->bits.height)
5997 : {
5998 : const fetcher_info_t *f;
5999 :
6000 0 : for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6001 : {
6002 0 : if (image->common.extended_format_code == f->format)
6003 : {
6004 0 : uint8_t *b = (uint8_t *)image->bits.bits;
6005 0 : int s = image->bits.rowstride * 4;
6006 :
6007 0 : iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6008 0 : iter->stride = s;
6009 :
6010 0 : iter->get_scanline = f->get_scanline;
6011 0 : return;
6012 : }
6013 : }
6014 : }
6015 :
6016 0 : imp->delegate->src_iter_init (imp->delegate, iter);
6017 : }
6018 :
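     : /* Build the SSE2 implementation: initialize the constant masks
     :  * used throughout this file and hook up the combiners, blt, fill
     :  * and source iterator entry points, delegating everything else.
     :  */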
6019 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6020 : __attribute__((__force_align_arg_pointer__))
6021 : #endif
6022 : pixman_implementation_t *
6023 4 : _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6024 : {
6025 4 : pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6026 :
6027 : /* SSE2 constants */
6028 4 : mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6029 4 : mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6030 4 : mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6031 4 : mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6032 4 : mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6033 4 : mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6034 4 : mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6035 4 : mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6036 4 : mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6037 4 : mask_0080 = create_mask_16_128 (0x0080);
6038 4 : mask_00ff = create_mask_16_128 (0x00ff);
6039 4 : mask_0101 = create_mask_16_128 (0x0101);
6040 4 : mask_ffff = create_mask_16_128 (0xffff);
6041 4 : mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6042 4 : mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6043 :
6044 : /* Set up function pointers */
6045 4 : imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6046 4 : imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6047 4 : imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6048 4 : imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6049 4 : imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6050 4 : imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6051 4 : imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6052 4 : imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6053 4 : imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6054 4 : imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6055 :
6056 4 : imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6057 :
6058 4 : imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6059 4 : imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6060 4 : imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6061 4 : imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6062 4 : imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6063 4 : imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6064 4 : imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6065 4 : imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6066 4 : imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6067 4 : imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6068 4 : imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6069 :
6070 4 : imp->blt = sse2_blt;
6071 4 : imp->fill = sse2_fill;
6072 :
6073 4 : imp->src_iter_init = sse2_src_iter_init;
6074 :
6075 4 : return imp;
6076 : }
|