1 : /*
2 : * Copyright © 2004, 2005 Red Hat, Inc.
3 : * Copyright © 2004 Nicholas Miell
4 : * Copyright © 2005 Trolltech AS
5 : *
6 : * Permission to use, copy, modify, distribute, and sell this software and its
7 : * documentation for any purpose is hereby granted without fee, provided that
8 : * the above copyright notice appear in all copies and that both that
9 : * copyright notice and this permission notice appear in supporting
10 : * documentation, and that the name of Red Hat not be used in advertising or
11 : * publicity pertaining to distribution of the software without specific,
12 : * written prior permission. Red Hat makes no representations about the
13 : * suitability of this software for any purpose. It is provided "as is"
14 : * without express or implied warranty.
15 : *
16 : * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 : * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 : * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 : * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 : * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 : * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 : * SOFTWARE.
24 : *
25 : * Author: Søren Sandmann (sandmann@redhat.com)
26 : * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 : * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28 : *
29 : * Based on work by Owen Taylor
30 : */
31 :
32 : #ifdef HAVE_CONFIG_H
33 : #include <config.h>
34 : #endif
35 :
36 : #ifdef USE_MMX
37 :
38 : #include <mmintrin.h>
39 : #include "pixman-private.h"
40 : #include "pixman-combine32.h"
41 :
42 : #define no_vERBOSE
43 :
44 : #ifdef VERBOSE
45 : #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
46 : #else
47 : #define CHECKPOINT()
48 : #endif
49 :
50 : /* Notes about writing mmx code
51 : *
52 : * give memory operands as the second operand. If you give it as the
53 : * first, gcc will first load it into a register, then use that
54 : * register
55 : *
56 : * ie. use
57 : *
58 : * _mm_mullo_pi16 (x, mmx_constant);
59 : *
60 : * not
61 : *
62 : * _mm_mullo_pi16 (mmx_constant, x);
63 : *
64 : * Also try to minimize dependencies. i.e. when you need a value, try
65 : * to calculate it from a value that was calculated as early as
66 : * possible.
67 : */
68 :
69 : /* --------------- MMX primitives ------------------------------------- */
70 :
71 : #ifdef __GNUC__
72 : typedef uint64_t mmxdatafield;
73 : #else
74 : typedef __m64 mmxdatafield;
75 : /* If __m64 is defined as a struct or union, define M64_MEMBER to be the
76 : name of the member used to access the data */
77 : # ifdef _MSC_VER
78 : # define M64_MEMBER m64_u64
79 : # elif defined(__SUNPRO_C)
80 : # define M64_MEMBER l_
81 : # endif
82 : #endif
83 :
84 : typedef struct
85 : {
86 : mmxdatafield mmx_4x00ff;
87 : mmxdatafield mmx_4x0080;
88 : mmxdatafield mmx_565_rgb;
89 : mmxdatafield mmx_565_unpack_multiplier;
90 : mmxdatafield mmx_565_r;
91 : mmxdatafield mmx_565_g;
92 : mmxdatafield mmx_565_b;
93 : mmxdatafield mmx_mask_0;
94 : mmxdatafield mmx_mask_1;
95 : mmxdatafield mmx_mask_2;
96 : mmxdatafield mmx_mask_3;
97 : mmxdatafield mmx_full_alpha;
98 : mmxdatafield mmx_ffff0000ffff0000;
99 : mmxdatafield mmx_0000ffff00000000;
100 : mmxdatafield mmx_000000000000ffff;
101 : } mmx_data_t;
102 :
103 : #if defined(_MSC_VER)
104 : # define MMXDATA_INIT(field, val) { val ## UI64 }
105 : #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
106 : # define MMXDATA_INIT(field, val) field = { val ## ULL }
107 : #else /* __m64 is an integral type */
108 : # define MMXDATA_INIT(field, val) field = val ## ULL
109 : #endif
110 :
111 : static const mmx_data_t c =
112 : {
113 : MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
114 : MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
115 : MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
116 : MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
117 : MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
118 : MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
119 : MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
120 : MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
121 : MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
122 : MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
123 : MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
124 : MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
125 : MMXDATA_INIT (.mmx_ffff0000ffff0000, 0xffff0000ffff0000),
126 : MMXDATA_INIT (.mmx_0000ffff00000000, 0x0000ffff00000000),
127 : MMXDATA_INIT (.mmx_000000000000ffff, 0x000000000000ffff),
128 : };
129 :
130 : #ifdef __GNUC__
131 : # ifdef __ICC
132 : # define MC(x) to_m64 (c.mmx_ ## x)
133 : # else
134 : # define MC(x) ((__m64)c.mmx_ ## x)
135 : # endif
136 : #else
137 : # define MC(x) c.mmx_ ## x
138 : #endif
139 :
/* Convert a uint64_t bit pattern into an __m64, coping with the three
 * possible definitions of __m64: an ICC intrinsic type, a struct/union
 * (accessed through M64_MEMBER), or a plain integral type. */
static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef __ICC
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else /* __m64 is an integral type */
    return (__m64)x;
#endif
}
154 :
/* Inverse of to_m64(): extract the raw 64-bit pattern from an __m64,
 * again handling the three possible definitions of __m64. */
static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef __ICC
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#else /* __m64 is an integral type */
    return (uint64_t)x;
#endif
}
167 :
168 : static force_inline __m64
169 : shift (__m64 v,
170 : int s)
171 : {
172 0 : if (s > 0)
173 : return _mm_slli_si64 (v, s);
174 0 : else if (s < 0)
175 0 : return _mm_srli_si64 (v, -s);
176 : else
177 0 : return v;
178 : }
179 :
180 : static force_inline __m64
181 : negate (__m64 mask)
182 : {
183 0 : return _mm_xor_si64 (mask, MC (4x00ff));
184 : }
185 :
186 : static force_inline __m64
187 : pix_multiply (__m64 a, __m64 b)
188 : {
189 : __m64 res;
190 :
191 0 : res = _mm_mullo_pi16 (a, b);
192 0 : res = _mm_adds_pu16 (res, MC (4x0080));
193 0 : res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
194 0 : res = _mm_srli_pi16 (res, 8);
195 :
196 0 : return res;
197 : }
198 :
199 : static force_inline __m64
200 : pix_add (__m64 a, __m64 b)
201 : {
202 : return _mm_adds_pu8 (a, b);
203 : }
204 :
205 : static force_inline __m64
206 : expand_alpha (__m64 pixel)
207 : {
208 : __m64 t1, t2;
209 :
210 0 : t1 = shift (pixel, -48);
211 0 : t2 = shift (t1, 16);
212 0 : t1 = _mm_or_si64 (t1, t2);
213 0 : t2 = shift (t1, 32);
214 0 : t1 = _mm_or_si64 (t1, t2);
215 :
216 0 : return t1;
217 : }
218 :
219 : static force_inline __m64
220 : expand_alpha_rev (__m64 pixel)
221 : {
222 : __m64 t1, t2;
223 :
224 : /* move alpha to low 16 bits and zero the rest */
225 0 : t1 = shift (pixel, 48);
226 0 : t1 = shift (t1, -48);
227 :
228 0 : t2 = shift (t1, 16);
229 0 : t1 = _mm_or_si64 (t1, t2);
230 0 : t2 = shift (t1, 32);
231 0 : t1 = _mm_or_si64 (t1, t2);
232 :
233 0 : return t1;
234 : }
235 :
236 : static force_inline __m64
237 : invert_colors (__m64 pixel)
238 : {
239 : __m64 x, y, z;
240 :
241 0 : x = y = z = pixel;
242 :
243 0 : x = _mm_and_si64 (x, MC (ffff0000ffff0000));
244 0 : y = _mm_and_si64 (y, MC (000000000000ffff));
245 0 : z = _mm_and_si64 (z, MC (0000ffff00000000));
246 :
247 0 : y = shift (y, 32);
248 0 : z = shift (z, -32);
249 :
250 0 : x = _mm_or_si64 (x, y);
251 0 : x = _mm_or_si64 (x, z);
252 :
253 0 : return x;
254 : }
255 :
256 : static force_inline __m64
257 : over (__m64 src,
258 : __m64 srca,
259 : __m64 dest)
260 : {
261 : return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
262 : }
263 :
264 : static force_inline __m64
265 : over_rev_non_pre (__m64 src, __m64 dest)
266 : {
267 0 : __m64 srca = expand_alpha (src);
268 0 : __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
269 :
270 0 : return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
271 : }
272 :
273 : static force_inline __m64
274 : in (__m64 src, __m64 mask)
275 : {
276 : return pix_multiply (src, mask);
277 : }
278 :
279 : static force_inline __m64
280 : in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
281 : {
282 : src = _mm_or_si64 (src, MC (full_alpha));
283 :
284 : return over (in (src, mask), mask, dest);
285 : }
286 :
#ifndef _MSC_VER
/* in_over: dest = (src IN mask) OVER dest, with the mask also
 * attenuating the source alpha.  Function form for compilers that
 * honor force_inline. */
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

/* Macro form of in_over() for MSVC.  NOTE(review): `mask` is expanded
 * twice -- callers must pass side-effect-free expressions. */
#define in_over(src, srca, mask, dest)   \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
300 :
301 : static force_inline __m64
302 : load8888 (uint32_t v)
303 : {
304 0 : return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
305 : }
306 :
307 : static force_inline __m64
308 : pack8888 (__m64 lo, __m64 hi)
309 : {
310 : return _mm_packs_pu16 (lo, hi);
311 : }
312 :
313 : static force_inline uint32_t
314 : store8888 (__m64 v)
315 : {
316 0 : return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
317 : }
318 :
319 : /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
320 : *
321 : * 00RR00GG00BB
322 : *
323 : * --- Expanding 565 in the low word ---
324 : *
325 : * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
326 : * m = m & (01f0003f001f);
327 : * m = m * (008404100840);
328 : * m = m >> 8;
329 : *
330 : * Note the trick here - the top word is shifted by another nibble to
331 : * avoid it bumping into the middle word
332 : */
333 : static force_inline __m64
334 : expand565 (__m64 pixel, int pos)
335 : {
336 0 : __m64 p = pixel;
337 : __m64 t1, t2;
338 :
339 : /* move pixel to low 16 bit and zero the rest */
340 0 : p = shift (shift (p, (3 - pos) * 16), -48);
341 :
342 0 : t1 = shift (p, 36 - 11);
343 0 : t2 = shift (p, 16 - 5);
344 :
345 0 : p = _mm_or_si64 (t1, p);
346 0 : p = _mm_or_si64 (t2, p);
347 0 : p = _mm_and_si64 (p, MC (565_rgb));
348 :
349 0 : pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
350 : return _mm_srli_pi16 (pixel, 8);
351 : }
352 :
353 : static force_inline __m64
354 : expand8888 (__m64 in, int pos)
355 : {
356 0 : if (pos == 0)
357 : return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
358 : else
359 : return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
360 : }
361 :
362 : static force_inline __m64
363 : expandx888 (__m64 in, int pos)
364 : {
365 0 : return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
366 : }
367 :
368 : static force_inline __m64
369 : pack_565 (__m64 pixel, __m64 target, int pos)
370 : {
371 0 : __m64 p = pixel;
372 0 : __m64 t = target;
373 : __m64 r, g, b;
374 :
375 0 : r = _mm_and_si64 (p, MC (565_r));
376 0 : g = _mm_and_si64 (p, MC (565_g));
377 0 : b = _mm_and_si64 (p, MC (565_b));
378 :
379 0 : r = shift (r, -(32 - 8) + pos * 16);
380 0 : g = shift (g, -(16 - 3) + pos * 16);
381 0 : b = shift (b, -(0 + 3) + pos * 16);
382 :
383 0 : if (pos == 0)
384 0 : t = _mm_and_si64 (t, MC (mask_0));
385 0 : else if (pos == 1)
386 0 : t = _mm_and_si64 (t, MC (mask_1));
387 0 : else if (pos == 2)
388 0 : t = _mm_and_si64 (t, MC (mask_2));
389 0 : else if (pos == 3)
390 0 : t = _mm_and_si64 (t, MC (mask_3));
391 :
392 0 : p = _mm_or_si64 (r, t);
393 0 : p = _mm_or_si64 (g, p);
394 :
395 0 : return _mm_or_si64 (b, p);
396 : }
397 :
398 : #ifndef _MSC_VER
399 :
400 : static force_inline __m64
401 : pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
402 : {
403 : x = pix_multiply (x, a);
404 : y = pix_multiply (y, b);
405 :
406 : return pix_add (x, y);
407 : }
408 :
409 : #else
410 :
411 : #define pix_add_mul(x, a, y, b) \
412 : ( x = pix_multiply (x, a), \
413 : y = pix_multiply (y, a), \
414 : pix_add (x, y) )
415 :
416 : #endif
417 :
418 : /* --------------- MMX code patch for fbcompose.c --------------------- */
419 :
420 : static force_inline uint32_t
421 : combine (const uint32_t *src, const uint32_t *mask)
422 : {
423 0 : uint32_t ssrc = *src;
424 :
425 0 : if (mask)
426 : {
427 0 : __m64 m = load8888 (*mask);
428 0 : __m64 s = load8888 (ssrc);
429 :
430 0 : m = expand_alpha (m);
431 0 : s = pix_multiply (s, m);
432 :
433 0 : ssrc = store8888 (s);
434 : }
435 :
436 0 : return ssrc;
437 : }
438 :
439 : static void
440 0 : mmx_combine_over_u (pixman_implementation_t *imp,
441 : pixman_op_t op,
442 : uint32_t * dest,
443 : const uint32_t * src,
444 : const uint32_t * mask,
445 : int width)
446 : {
447 0 : const uint32_t *end = dest + width;
448 :
449 0 : while (dest < end)
450 : {
451 : uint32_t ssrc = combine (src, mask);
452 0 : uint32_t a = ssrc >> 24;
453 :
454 0 : if (a == 0xff)
455 : {
456 0 : *dest = ssrc;
457 : }
458 0 : else if (ssrc)
459 : {
460 : __m64 s, sa;
461 0 : s = load8888 (ssrc);
462 0 : sa = expand_alpha (s);
463 0 : *dest = store8888 (over (s, sa, load8888 (*dest)));
464 : }
465 :
466 0 : ++dest;
467 0 : ++src;
468 0 : if (mask)
469 0 : ++mask;
470 : }
471 : _mm_empty ();
472 0 : }
473 :
474 : static void
475 0 : mmx_combine_over_reverse_u (pixman_implementation_t *imp,
476 : pixman_op_t op,
477 : uint32_t * dest,
478 : const uint32_t * src,
479 : const uint32_t * mask,
480 : int width)
481 : {
482 0 : const uint32_t *end = dest + width;
483 :
484 0 : while (dest < end)
485 : {
486 : __m64 d, da;
487 : uint32_t s = combine (src, mask);
488 :
489 0 : d = load8888 (*dest);
490 0 : da = expand_alpha (d);
491 0 : *dest = store8888 (over (d, da, load8888 (s)));
492 :
493 0 : ++dest;
494 0 : ++src;
495 0 : if (mask)
496 0 : mask++;
497 : }
498 : _mm_empty ();
499 0 : }
500 :
501 : static void
502 0 : mmx_combine_in_u (pixman_implementation_t *imp,
503 : pixman_op_t op,
504 : uint32_t * dest,
505 : const uint32_t * src,
506 : const uint32_t * mask,
507 : int width)
508 : {
509 0 : const uint32_t *end = dest + width;
510 :
511 0 : while (dest < end)
512 : {
513 : __m64 x, a;
514 :
515 0 : x = load8888 (combine (src, mask));
516 0 : a = load8888 (*dest);
517 0 : a = expand_alpha (a);
518 0 : x = pix_multiply (x, a);
519 :
520 0 : *dest = store8888 (x);
521 :
522 0 : ++dest;
523 0 : ++src;
524 0 : if (mask)
525 0 : mask++;
526 : }
527 : _mm_empty ();
528 0 : }
529 :
530 : static void
531 0 : mmx_combine_in_reverse_u (pixman_implementation_t *imp,
532 : pixman_op_t op,
533 : uint32_t * dest,
534 : const uint32_t * src,
535 : const uint32_t * mask,
536 : int width)
537 : {
538 0 : const uint32_t *end = dest + width;
539 :
540 0 : while (dest < end)
541 : {
542 : __m64 x, a;
543 :
544 0 : x = load8888 (*dest);
545 0 : a = load8888 (combine (src, mask));
546 0 : a = expand_alpha (a);
547 0 : x = pix_multiply (x, a);
548 0 : *dest = store8888 (x);
549 :
550 0 : ++dest;
551 0 : ++src;
552 0 : if (mask)
553 0 : mask++;
554 : }
555 : _mm_empty ();
556 0 : }
557 :
558 : static void
559 0 : mmx_combine_out_u (pixman_implementation_t *imp,
560 : pixman_op_t op,
561 : uint32_t * dest,
562 : const uint32_t * src,
563 : const uint32_t * mask,
564 : int width)
565 : {
566 0 : const uint32_t *end = dest + width;
567 :
568 0 : while (dest < end)
569 : {
570 : __m64 x, a;
571 :
572 0 : x = load8888 (combine (src, mask));
573 0 : a = load8888 (*dest);
574 0 : a = expand_alpha (a);
575 0 : a = negate (a);
576 0 : x = pix_multiply (x, a);
577 0 : *dest = store8888 (x);
578 :
579 0 : ++dest;
580 0 : ++src;
581 0 : if (mask)
582 0 : mask++;
583 : }
584 : _mm_empty ();
585 0 : }
586 :
587 : static void
588 0 : mmx_combine_out_reverse_u (pixman_implementation_t *imp,
589 : pixman_op_t op,
590 : uint32_t * dest,
591 : const uint32_t * src,
592 : const uint32_t * mask,
593 : int width)
594 : {
595 0 : const uint32_t *end = dest + width;
596 :
597 0 : while (dest < end)
598 : {
599 : __m64 x, a;
600 :
601 0 : x = load8888 (*dest);
602 0 : a = load8888 (combine (src, mask));
603 0 : a = expand_alpha (a);
604 0 : a = negate (a);
605 0 : x = pix_multiply (x, a);
606 :
607 0 : *dest = store8888 (x);
608 :
609 0 : ++dest;
610 0 : ++src;
611 0 : if (mask)
612 0 : mask++;
613 : }
614 : _mm_empty ();
615 0 : }
616 :
617 : static void
618 0 : mmx_combine_atop_u (pixman_implementation_t *imp,
619 : pixman_op_t op,
620 : uint32_t * dest,
621 : const uint32_t * src,
622 : const uint32_t * mask,
623 : int width)
624 : {
625 0 : const uint32_t *end = dest + width;
626 :
627 0 : while (dest < end)
628 : {
629 : __m64 s, da, d, sia;
630 :
631 0 : s = load8888 (combine (src, mask));
632 0 : d = load8888 (*dest);
633 0 : sia = expand_alpha (s);
634 0 : sia = negate (sia);
635 0 : da = expand_alpha (d);
636 0 : s = pix_add_mul (s, da, d, sia);
637 0 : *dest = store8888 (s);
638 :
639 0 : ++dest;
640 0 : ++src;
641 0 : if (mask)
642 0 : mask++;
643 : }
644 : _mm_empty ();
645 0 : }
646 :
647 : static void
648 0 : mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
649 : pixman_op_t op,
650 : uint32_t * dest,
651 : const uint32_t * src,
652 : const uint32_t * mask,
653 : int width)
654 : {
655 : const uint32_t *end;
656 :
657 0 : end = dest + width;
658 :
659 0 : while (dest < end)
660 : {
661 : __m64 s, dia, d, sa;
662 :
663 0 : s = load8888 (combine (src, mask));
664 0 : d = load8888 (*dest);
665 0 : sa = expand_alpha (s);
666 0 : dia = expand_alpha (d);
667 0 : dia = negate (dia);
668 0 : s = pix_add_mul (s, dia, d, sa);
669 0 : *dest = store8888 (s);
670 :
671 0 : ++dest;
672 0 : ++src;
673 0 : if (mask)
674 0 : mask++;
675 : }
676 : _mm_empty ();
677 0 : }
678 :
679 : static void
680 0 : mmx_combine_xor_u (pixman_implementation_t *imp,
681 : pixman_op_t op,
682 : uint32_t * dest,
683 : const uint32_t * src,
684 : const uint32_t * mask,
685 : int width)
686 : {
687 0 : const uint32_t *end = dest + width;
688 :
689 0 : while (dest < end)
690 : {
691 : __m64 s, dia, d, sia;
692 :
693 0 : s = load8888 (combine (src, mask));
694 0 : d = load8888 (*dest);
695 0 : sia = expand_alpha (s);
696 0 : dia = expand_alpha (d);
697 0 : sia = negate (sia);
698 0 : dia = negate (dia);
699 0 : s = pix_add_mul (s, dia, d, sia);
700 0 : *dest = store8888 (s);
701 :
702 0 : ++dest;
703 0 : ++src;
704 0 : if (mask)
705 0 : mask++;
706 : }
707 : _mm_empty ();
708 0 : }
709 :
710 : static void
711 0 : mmx_combine_add_u (pixman_implementation_t *imp,
712 : pixman_op_t op,
713 : uint32_t * dest,
714 : const uint32_t * src,
715 : const uint32_t * mask,
716 : int width)
717 : {
718 0 : const uint32_t *end = dest + width;
719 :
720 0 : while (dest < end)
721 : {
722 : __m64 s, d;
723 :
724 0 : s = load8888 (combine (src, mask));
725 0 : d = load8888 (*dest);
726 0 : s = pix_add (s, d);
727 0 : *dest = store8888 (s);
728 :
729 0 : ++dest;
730 0 : ++src;
731 0 : if (mask)
732 0 : mask++;
733 : }
734 : _mm_empty ();
735 0 : }
736 :
737 : static void
738 0 : mmx_combine_saturate_u (pixman_implementation_t *imp,
739 : pixman_op_t op,
740 : uint32_t * dest,
741 : const uint32_t * src,
742 : const uint32_t * mask,
743 : int width)
744 : {
745 0 : const uint32_t *end = dest + width;
746 :
747 0 : while (dest < end)
748 : {
749 : uint32_t s = combine (src, mask);
750 0 : uint32_t d = *dest;
751 0 : __m64 ms = load8888 (s);
752 0 : __m64 md = load8888 (d);
753 0 : uint32_t sa = s >> 24;
754 0 : uint32_t da = ~d >> 24;
755 :
756 0 : if (sa > da)
757 : {
758 0 : __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
759 0 : msa = expand_alpha (msa);
760 0 : ms = pix_multiply (ms, msa);
761 : }
762 :
763 0 : md = pix_add (md, ms);
764 0 : *dest = store8888 (md);
765 :
766 0 : ++src;
767 0 : ++dest;
768 0 : if (mask)
769 0 : mask++;
770 : }
771 : _mm_empty ();
772 0 : }
773 :
774 : static void
775 0 : mmx_combine_src_ca (pixman_implementation_t *imp,
776 : pixman_op_t op,
777 : uint32_t * dest,
778 : const uint32_t * src,
779 : const uint32_t * mask,
780 : int width)
781 : {
782 0 : const uint32_t *end = src + width;
783 :
784 0 : while (src < end)
785 : {
786 0 : __m64 a = load8888 (*mask);
787 0 : __m64 s = load8888 (*src);
788 :
789 0 : s = pix_multiply (s, a);
790 0 : *dest = store8888 (s);
791 :
792 0 : ++src;
793 0 : ++mask;
794 0 : ++dest;
795 : }
796 : _mm_empty ();
797 0 : }
798 :
799 : static void
800 0 : mmx_combine_over_ca (pixman_implementation_t *imp,
801 : pixman_op_t op,
802 : uint32_t * dest,
803 : const uint32_t * src,
804 : const uint32_t * mask,
805 : int width)
806 : {
807 0 : const uint32_t *end = src + width;
808 :
809 0 : while (src < end)
810 : {
811 0 : __m64 a = load8888 (*mask);
812 0 : __m64 s = load8888 (*src);
813 0 : __m64 d = load8888 (*dest);
814 0 : __m64 sa = expand_alpha (s);
815 :
816 0 : *dest = store8888 (in_over (s, sa, a, d));
817 :
818 0 : ++src;
819 0 : ++dest;
820 0 : ++mask;
821 : }
822 : _mm_empty ();
823 0 : }
824 :
825 : static void
826 0 : mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
827 : pixman_op_t op,
828 : uint32_t * dest,
829 : const uint32_t * src,
830 : const uint32_t * mask,
831 : int width)
832 : {
833 0 : const uint32_t *end = src + width;
834 :
835 0 : while (src < end)
836 : {
837 0 : __m64 a = load8888 (*mask);
838 0 : __m64 s = load8888 (*src);
839 0 : __m64 d = load8888 (*dest);
840 0 : __m64 da = expand_alpha (d);
841 :
842 0 : *dest = store8888 (over (d, da, in (s, a)));
843 :
844 0 : ++src;
845 0 : ++dest;
846 0 : ++mask;
847 : }
848 : _mm_empty ();
849 0 : }
850 :
851 : static void
852 0 : mmx_combine_in_ca (pixman_implementation_t *imp,
853 : pixman_op_t op,
854 : uint32_t * dest,
855 : const uint32_t * src,
856 : const uint32_t * mask,
857 : int width)
858 : {
859 0 : const uint32_t *end = src + width;
860 :
861 0 : while (src < end)
862 : {
863 0 : __m64 a = load8888 (*mask);
864 0 : __m64 s = load8888 (*src);
865 0 : __m64 d = load8888 (*dest);
866 0 : __m64 da = expand_alpha (d);
867 :
868 0 : s = pix_multiply (s, a);
869 0 : s = pix_multiply (s, da);
870 0 : *dest = store8888 (s);
871 :
872 0 : ++src;
873 0 : ++dest;
874 0 : ++mask;
875 : }
876 : _mm_empty ();
877 0 : }
878 :
879 : static void
880 0 : mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
881 : pixman_op_t op,
882 : uint32_t * dest,
883 : const uint32_t * src,
884 : const uint32_t * mask,
885 : int width)
886 : {
887 0 : const uint32_t *end = src + width;
888 :
889 0 : while (src < end)
890 : {
891 0 : __m64 a = load8888 (*mask);
892 0 : __m64 s = load8888 (*src);
893 0 : __m64 d = load8888 (*dest);
894 0 : __m64 sa = expand_alpha (s);
895 :
896 0 : a = pix_multiply (a, sa);
897 0 : d = pix_multiply (d, a);
898 0 : *dest = store8888 (d);
899 :
900 0 : ++src;
901 0 : ++dest;
902 0 : ++mask;
903 : }
904 : _mm_empty ();
905 0 : }
906 :
907 : static void
908 0 : mmx_combine_out_ca (pixman_implementation_t *imp,
909 : pixman_op_t op,
910 : uint32_t * dest,
911 : const uint32_t * src,
912 : const uint32_t * mask,
913 : int width)
914 : {
915 0 : const uint32_t *end = src + width;
916 :
917 0 : while (src < end)
918 : {
919 0 : __m64 a = load8888 (*mask);
920 0 : __m64 s = load8888 (*src);
921 0 : __m64 d = load8888 (*dest);
922 0 : __m64 da = expand_alpha (d);
923 :
924 0 : da = negate (da);
925 0 : s = pix_multiply (s, a);
926 0 : s = pix_multiply (s, da);
927 0 : *dest = store8888 (s);
928 :
929 0 : ++src;
930 0 : ++dest;
931 0 : ++mask;
932 : }
933 : _mm_empty ();
934 0 : }
935 :
936 : static void
937 0 : mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
938 : pixman_op_t op,
939 : uint32_t * dest,
940 : const uint32_t * src,
941 : const uint32_t * mask,
942 : int width)
943 : {
944 0 : const uint32_t *end = src + width;
945 :
946 0 : while (src < end)
947 : {
948 0 : __m64 a = load8888 (*mask);
949 0 : __m64 s = load8888 (*src);
950 0 : __m64 d = load8888 (*dest);
951 0 : __m64 sa = expand_alpha (s);
952 :
953 0 : a = pix_multiply (a, sa);
954 0 : a = negate (a);
955 0 : d = pix_multiply (d, a);
956 0 : *dest = store8888 (d);
957 :
958 0 : ++src;
959 0 : ++dest;
960 0 : ++mask;
961 : }
962 : _mm_empty ();
963 0 : }
964 :
965 : static void
966 0 : mmx_combine_atop_ca (pixman_implementation_t *imp,
967 : pixman_op_t op,
968 : uint32_t * dest,
969 : const uint32_t * src,
970 : const uint32_t * mask,
971 : int width)
972 : {
973 0 : const uint32_t *end = src + width;
974 :
975 0 : while (src < end)
976 : {
977 0 : __m64 a = load8888 (*mask);
978 0 : __m64 s = load8888 (*src);
979 0 : __m64 d = load8888 (*dest);
980 0 : __m64 da = expand_alpha (d);
981 0 : __m64 sa = expand_alpha (s);
982 :
983 0 : s = pix_multiply (s, a);
984 0 : a = pix_multiply (a, sa);
985 0 : a = negate (a);
986 0 : d = pix_add_mul (d, a, s, da);
987 0 : *dest = store8888 (d);
988 :
989 0 : ++src;
990 0 : ++dest;
991 0 : ++mask;
992 : }
993 : _mm_empty ();
994 0 : }
995 :
996 : static void
997 0 : mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
998 : pixman_op_t op,
999 : uint32_t * dest,
1000 : const uint32_t * src,
1001 : const uint32_t * mask,
1002 : int width)
1003 : {
1004 0 : const uint32_t *end = src + width;
1005 :
1006 0 : while (src < end)
1007 : {
1008 0 : __m64 a = load8888 (*mask);
1009 0 : __m64 s = load8888 (*src);
1010 0 : __m64 d = load8888 (*dest);
1011 0 : __m64 da = expand_alpha (d);
1012 0 : __m64 sa = expand_alpha (s);
1013 :
1014 0 : s = pix_multiply (s, a);
1015 0 : a = pix_multiply (a, sa);
1016 0 : da = negate (da);
1017 0 : d = pix_add_mul (d, a, s, da);
1018 0 : *dest = store8888 (d);
1019 :
1020 0 : ++src;
1021 0 : ++dest;
1022 0 : ++mask;
1023 : }
1024 : _mm_empty ();
1025 0 : }
1026 :
1027 : static void
1028 0 : mmx_combine_xor_ca (pixman_implementation_t *imp,
1029 : pixman_op_t op,
1030 : uint32_t * dest,
1031 : const uint32_t * src,
1032 : const uint32_t * mask,
1033 : int width)
1034 : {
1035 0 : const uint32_t *end = src + width;
1036 :
1037 0 : while (src < end)
1038 : {
1039 0 : __m64 a = load8888 (*mask);
1040 0 : __m64 s = load8888 (*src);
1041 0 : __m64 d = load8888 (*dest);
1042 0 : __m64 da = expand_alpha (d);
1043 0 : __m64 sa = expand_alpha (s);
1044 :
1045 0 : s = pix_multiply (s, a);
1046 0 : a = pix_multiply (a, sa);
1047 0 : da = negate (da);
1048 0 : a = negate (a);
1049 0 : d = pix_add_mul (d, a, s, da);
1050 0 : *dest = store8888 (d);
1051 :
1052 0 : ++src;
1053 0 : ++dest;
1054 0 : ++mask;
1055 : }
1056 : _mm_empty ();
1057 0 : }
1058 :
1059 : static void
1060 0 : mmx_combine_add_ca (pixman_implementation_t *imp,
1061 : pixman_op_t op,
1062 : uint32_t * dest,
1063 : const uint32_t * src,
1064 : const uint32_t * mask,
1065 : int width)
1066 : {
1067 0 : const uint32_t *end = src + width;
1068 :
1069 0 : while (src < end)
1070 : {
1071 0 : __m64 a = load8888 (*mask);
1072 0 : __m64 s = load8888 (*src);
1073 0 : __m64 d = load8888 (*dest);
1074 :
1075 0 : s = pix_multiply (s, a);
1076 0 : d = pix_add (s, d);
1077 0 : *dest = store8888 (d);
1078 :
1079 0 : ++src;
1080 0 : ++dest;
1081 0 : ++mask;
1082 : }
1083 : _mm_empty ();
1084 0 : }
1085 :
1086 : /* ------------- MMX code paths called from fbpict.c -------------------- */
1087 :
/* Composite a solid color OVER an a8r8g8b8 destination rectangle.
 * Standard pixman flat-composite entry: the source image must resolve
 * to a single solid pixel; the dest_x/dest_y/width/height parameters
 * select the destination region. */
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           pixman_image_t *         src_image,
                           pixman_image_t *         mask_image,
                           pixman_image_t *         dst_image,
                           int32_t                  src_x,
                           int32_t                  src_y,
                           int32_t                  mask_x,
                           int32_t                  mask_y,
                           int32_t                  dest_x,
                           int32_t                  dest_y,
                           int32_t                  width,
                           int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    /* OVER with a fully transparent source is a no-op */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    /* the solid source (and its broadcast alpha) are loop-invariant */
    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* head: single pixels until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}

	/* body: two pixels per iteration through one 64-bit access */
	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* tail: at most one leftover pixel */
	while (w)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}
    }

    /* leave MMX state so following FPU code works */
    _mm_empty ();
}
1166 :
/* Composite a solid color OVER an r5g6b5 destination rectangle.
 * Each destination pixel is expanded to 8 bits per channel, blended,
 * then packed back to 565. */
static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           pixman_image_t *         src_image,
                           pixman_image_t *         mask_image,
                           pixman_image_t *         dst_image,
                           int32_t                  src_x,
                           int32_t                  src_y,
                           int32_t                  mask_x,
                           int32_t                  mask_y,
                           int32_t                  dest_x,
                           int32_t                  dest_y,
                           int32_t                  width,
                           int32_t                  height)
{
    uint32_t src;
    uint16_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    /* OVER with a fully transparent source is a no-op */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* loop-invariant expanded source and broadcast alpha */
    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* head: single 16-bit pixels until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	/* body: four 565 pixels per 64-bit access, blended one word
	 * at a time in place */
	while (w >= 4)
	{
	    __m64 vdest;

	    vdest = *(__m64 *)dst;

	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);

	    *(__m64 *)dst = vdest;

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	/* tail: up to three leftover pixels */
	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    /* leave MMX state so following FPU code works */
    _mm_empty ();
}
1254 :
/* Composite a solid color OVER an a8r8g8b8 destination through an
 * a8r8g8b8 component-alpha mask: each mask channel scales the matching
 * source channel (see in_over()).  Pixels whose mask is fully zero are
 * skipped entirely. */
static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_op_t              op,
                                   pixman_image_t *         src_image,
                                   pixman_image_t *         mask_image,
                                   pixman_image_t *         dst_image,
                                   int32_t                  src_x,
                                   int32_t                  src_y,
                                   int32_t                  mask_x,
                                   int32_t                  mask_y,
                                   int32_t                  dest_x,
                                   int32_t                  dest_y,
                                   int32_t                  width,
                                   int32_t                  height)
{
    uint32_t src, srca;
    uint32_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    /* NOTE(review): srca is computed but never read below -- looks like
     * a leftover; confirm before removing. */
    srca = src >> 24;
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* loop-invariant expanded source and broadcast alpha */
    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint32_t *q = (uint32_t *)dst_line;

	/* head: single pixels until the destination is 8-byte aligned */
	while (twidth && (unsigned long)q & 7)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (*q);
		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
		*q = store8888 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	/* body: two pixels per iteration; skip the pair when both
	 * mask pixels are zero */
	while (twidth >= 2)
	{
	    uint32_t m0, m1;
	    m0 = *p;
	    m1 = *(p + 1);

	    if (m0 | m1)
	    {
		__m64 dest0, dest1;
		__m64 vdest = *(__m64 *)q;

		dest0 = in_over (vsrc, vsrca, load8888 (m0),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, load8888 (m1),
		                 expand8888 (vdest, 1));

		*(__m64 *)q = pack8888 (dest0, dest1);
	    }

	    p += 2;
	    q += 2;
	    twidth -= 2;
	}

	/* tail: at most one leftover pixel */
	while (twidth)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (*q);
		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
		*q = store8888 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	dst_line += dst_stride;
	mask_line += mask_stride;
    }

    /* leave MMX state so following FPU code works */
    _mm_empty ();
}
1358 :
1359 : static void
1360 0 : mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1361 : pixman_op_t op,
1362 : pixman_image_t * src_image,
1363 : pixman_image_t * mask_image,
1364 : pixman_image_t * dst_image,
1365 : int32_t src_x,
1366 : int32_t src_y,
1367 : int32_t mask_x,
1368 : int32_t mask_y,
1369 : int32_t dest_x,
1370 : int32_t dest_y,
1371 : int32_t width,
1372 : int32_t height)
1373 : {
1374 : uint32_t *dst_line, *dst;
1375 : uint32_t *src_line, *src;
1376 : uint32_t mask;
1377 : __m64 vmask;
1378 : int dst_stride, src_stride;
1379 : int32_t w;
1380 : __m64 srca;
1381 :
1382 : CHECKPOINT ();
1383 :
1384 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1385 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1386 :
1387 0 : mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format);
1388 0 : mask &= 0xff000000;
1389 0 : mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1390 0 : vmask = load8888 (mask);
1391 0 : srca = MC (4x00ff);
1392 :
1393 0 : while (height--)
1394 : {
1395 0 : dst = dst_line;
1396 0 : dst_line += dst_stride;
1397 0 : src = src_line;
1398 0 : src_line += src_stride;
1399 0 : w = width;
1400 :
1401 0 : while (w && (unsigned long)dst & 7)
1402 : {
1403 0 : __m64 s = load8888 (*src);
1404 0 : __m64 d = load8888 (*dst);
1405 :
1406 0 : *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1407 :
1408 0 : w--;
1409 0 : dst++;
1410 0 : src++;
1411 : }
1412 :
1413 0 : while (w >= 2)
1414 : {
1415 0 : __m64 vs = *(__m64 *)src;
1416 0 : __m64 vd = *(__m64 *)dst;
1417 0 : __m64 vsrc0 = expand8888 (vs, 0);
1418 0 : __m64 vsrc1 = expand8888 (vs, 1);
1419 :
1420 0 : *(__m64 *)dst = pack8888 (
1421 : in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1422 : in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1423 :
1424 0 : w -= 2;
1425 0 : dst += 2;
1426 0 : src += 2;
1427 : }
1428 :
1429 0 : while (w)
1430 : {
1431 0 : __m64 s = load8888 (*src);
1432 0 : __m64 d = load8888 (*dst);
1433 :
1434 0 : *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1435 :
1436 0 : w--;
1437 0 : dst++;
1438 0 : src++;
1439 : }
1440 : }
1441 :
1442 : _mm_empty ();
1443 0 : }
1444 :
/* OVER of an x8r8g8b8 source (alpha forced to 0xff), modulated by a
 * solid mask, onto an a8r8g8b8 destination.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then a
 * 16-pixel unrolled MMX body, then a scalar tail.  Because the source
 * alpha is constant 0xff, 'srca' holds the 4x00ff constant instead of
 * being expanded per pixel.
 */
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format);

    /* Broadcast the mask's alpha byte into all four channels. */
    mask &= 0xff000000;
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);
    /* Source alpha is always opaque for x888 formats. */
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src | 0xff000000);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	/* Main: 16 pixels per iteration (8 MMX registers of dst + src). */
	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = *(__m64 *)(src + 0);
	    __m64 vs1 = *(__m64 *)(src + 2);
	    __m64 vs2 = *(__m64 *)(src + 4);
	    __m64 vs3 = *(__m64 *)(src + 6);
	    __m64 vs4 = *(__m64 *)(src + 8);
	    __m64 vs5 = *(__m64 *)(src + 10);
	    __m64 vs6 = *(__m64 *)(src + 12);
	    __m64 vs7 = *(__m64 *)(src + 14);

	    /* expandx888 expands one pixel and forces its alpha to 0xff. */
	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    __m64 s = load8888 (*src | 0xff000000);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
1580 :
1581 : static void
1582 0 : mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1583 : pixman_op_t op,
1584 : pixman_image_t * src_image,
1585 : pixman_image_t * mask_image,
1586 : pixman_image_t * dst_image,
1587 : int32_t src_x,
1588 : int32_t src_y,
1589 : int32_t mask_x,
1590 : int32_t mask_y,
1591 : int32_t dest_x,
1592 : int32_t dest_y,
1593 : int32_t width,
1594 : int32_t height)
1595 : {
1596 : uint32_t *dst_line, *dst;
1597 : uint32_t *src_line, *src;
1598 : uint32_t s;
1599 : int dst_stride, src_stride;
1600 : uint8_t a;
1601 : int32_t w;
1602 :
1603 : CHECKPOINT ();
1604 :
1605 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1606 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1607 :
1608 0 : while (height--)
1609 : {
1610 0 : dst = dst_line;
1611 0 : dst_line += dst_stride;
1612 0 : src = src_line;
1613 0 : src_line += src_stride;
1614 0 : w = width;
1615 :
1616 0 : while (w--)
1617 : {
1618 0 : s = *src++;
1619 0 : a = s >> 24;
1620 :
1621 0 : if (a == 0xff)
1622 : {
1623 0 : *dst = s;
1624 : }
1625 0 : else if (s)
1626 : {
1627 : __m64 ms, sa;
1628 0 : ms = load8888 (s);
1629 0 : sa = expand_alpha (ms);
1630 0 : *dst = store8888 (over (ms, sa, load8888 (*dst)));
1631 : }
1632 :
1633 0 : dst++;
1634 : }
1635 : }
1636 : _mm_empty ();
1637 0 : }
1638 :
/* OVER of an a8r8g8b8 source onto an r5g6b5 destination.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then four
 * 16-bit pixels per MMX register, then a scalar tail.  Each 565 pixel
 * is expanded to 8888, blended, and re-packed in place.
 */
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Main: four 565 pixels per MMX register; each slot is
	 * expanded, blended, and packed back into the same register. */
	while (w >= 4)
	{
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
	    __m64 vdest;

	    vsrc0 = load8888 (*(src + 0));
	    vsrc1 = load8888 (*(src + 1));
	    vsrc2 = load8888 (*(src + 2));
	    vsrc3 = load8888 (*(src + 3));

	    vdest = *(__m64 *)dst;

	    vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
	    vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
	    vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
	    vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
1741 :
/* OVER with a solid source and an a8 mask onto an a8r8g8b8 destination.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then two
 * pixels per MMX register, then a scalar tail.  When the source is
 * opaque and both mask bytes are 0xff, the pair is stored directly
 * from the precomputed 'srcsrc' without blending.
 */
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             pixman_image_t *         src_image,
                             pixman_image_t *         mask_image,
                             pixman_image_t *         dst_image,
                             int32_t                  src_x,
                             int32_t                  src_y,
                             int32_t                  mask_x,
                             int32_t                  mask_y,
                             int32_t                  dest_x,
                             int32_t                  dest_y,
                             int32_t                  width,
                             int32_t                  height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    srca = src >> 24;
    /* A zero source leaves the destination untouched under OVER. */
    if (src == 0)
	return;

    /* Two copies of the source pixel for the opaque fast path. */
    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over (vsrc, vsrca,
		                       expand_alpha_rev (to_m64 (m)),
		                       load8888 (*dst));

		*dst = store8888 (vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	/* Main: two pixels at a time. */
	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		/* Opaque source, fully-on mask: plain store. */
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
		                 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* Tail: remaining single pixel, if any. */
	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (*dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		*dst = store8888 (vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}
1865 :
/* Solid fill of a rectangle in an 8, 16, or 32 bpp image with the
 * value 'xor' (replicated to the pixel width).
 *
 * Returns FALSE (without touching the image) for unsupported depths.
 * Each scanline is filled with widening head loops (1-, 2-, 4-byte
 * stores) up to 8-byte alignment, 64 bytes at a time via MMX in the
 * middle, and narrowing tail loops for the remainder.
 *
 * On GCC, seven extra MMX registers are preloaded with the fill value
 * so the inner loop issues eight independent movq stores.
 */
pixman_bool_t
pixman_fill_mmx (uint32_t *bits,
                 int       stride,
                 int       bpp,
                 int       x,
                 int       y,
                 int       width,
                 int       height,
                 uint32_t xor)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#ifdef __GNUC__
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    /* Convert the uint32_t stride to a byte stride for this bpp and
     * replicate 'xor' so the low 32 bits are a repeating pixel pattern. */
    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
	xor = (xor & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
	xor = (xor & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    fill = ((uint64_t)xor << 32) | xor;
    vfill = to_m64 (fill);

#ifdef __GNUC__
    /* Duplicate the fill value into seven more MMX registers so the
     * 64-byte store loop below doesn't reuse a single register. */
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	/* Widening head: byte, word, dword stores up to 8-byte alignment. */
	while (w >= 1 && ((unsigned long)d & 1))
	{
	    *(uint8_t *)d = (xor & 0xff);
	    w--;
	    d++;
	}

	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = xor;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(uint32_t *)d = xor;

	    w -= 4;
	    d += 4;
	}

	/* Main: 64 bytes per iteration. */
	while (w >= 64)
	{
#ifdef __GNUC__
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	/* Narrowing tail: dword, word, byte stores for the remainder. */
	while (w >= 4)
	{
	    *(uint32_t *)d = xor;

	    w -= 4;
	    d += 4;
	}
	while (w >= 2)
	{
	    *(uint16_t *)d = xor;
	    w -= 2;
	    d += 2;
	}
	while (w >= 1)
	{
	    *(uint8_t *)d = (xor & 0xff);
	    w--;
	    d++;
	}

    }

    _mm_empty ();
    return TRUE;
}
2015 :
2016 : static void
2017 0 : mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2018 : pixman_op_t op,
2019 : pixman_image_t * src_image,
2020 : pixman_image_t * mask_image,
2021 : pixman_image_t * dst_image,
2022 : int32_t src_x,
2023 : int32_t src_y,
2024 : int32_t mask_x,
2025 : int32_t mask_y,
2026 : int32_t dest_x,
2027 : int32_t dest_y,
2028 : int32_t width,
2029 : int32_t height)
2030 : {
2031 : uint32_t src, srca;
2032 : uint32_t *dst_line, *dst;
2033 : uint8_t *mask_line, *mask;
2034 : int dst_stride, mask_stride;
2035 : int32_t w;
2036 : __m64 vsrc, vsrca;
2037 : uint64_t srcsrc;
2038 :
2039 : CHECKPOINT ();
2040 :
2041 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2042 :
2043 0 : srca = src >> 24;
2044 0 : if (src == 0)
2045 : {
2046 0 : pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride,
2047 0 : PIXMAN_FORMAT_BPP (dst_image->bits.format),
2048 : dest_x, dest_y, width, height, 0);
2049 0 : return;
2050 : }
2051 :
2052 0 : srcsrc = (uint64_t)src << 32 | src;
2053 :
2054 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2055 0 : PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2056 :
2057 0 : vsrc = load8888 (src);
2058 0 : vsrca = expand_alpha (vsrc);
2059 :
2060 0 : while (height--)
2061 : {
2062 0 : dst = dst_line;
2063 0 : dst_line += dst_stride;
2064 0 : mask = mask_line;
2065 0 : mask_line += mask_stride;
2066 0 : w = width;
2067 :
2068 : CHECKPOINT ();
2069 :
2070 0 : while (w && (unsigned long)dst & 7)
2071 : {
2072 0 : uint64_t m = *mask;
2073 :
2074 0 : if (m)
2075 : {
2076 0 : __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2077 :
2078 0 : *dst = store8888 (vdest);
2079 : }
2080 : else
2081 : {
2082 0 : *dst = 0;
2083 : }
2084 :
2085 0 : w--;
2086 0 : mask++;
2087 0 : dst++;
2088 : }
2089 :
2090 : CHECKPOINT ();
2091 :
2092 0 : while (w >= 2)
2093 : {
2094 : uint64_t m0, m1;
2095 0 : m0 = *mask;
2096 0 : m1 = *(mask + 1);
2097 :
2098 0 : if (srca == 0xff && (m0 & m1) == 0xff)
2099 : {
2100 0 : *(uint64_t *)dst = srcsrc;
2101 : }
2102 0 : else if (m0 | m1)
2103 : {
2104 : __m64 vdest;
2105 : __m64 dest0, dest1;
2106 :
2107 0 : vdest = *(__m64 *)dst;
2108 :
2109 0 : dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2110 0 : dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2111 :
2112 0 : *(__m64 *)dst = pack8888 (dest0, dest1);
2113 : }
2114 : else
2115 : {
2116 0 : *(uint64_t *)dst = 0;
2117 : }
2118 :
2119 0 : mask += 2;
2120 0 : dst += 2;
2121 0 : w -= 2;
2122 : }
2123 :
2124 : CHECKPOINT ();
2125 :
2126 0 : while (w)
2127 : {
2128 0 : uint64_t m = *mask;
2129 :
2130 0 : if (m)
2131 : {
2132 0 : __m64 vdest = load8888 (*dst);
2133 :
2134 0 : vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2135 0 : *dst = store8888 (vdest);
2136 : }
2137 : else
2138 : {
2139 0 : *dst = 0;
2140 : }
2141 :
2142 0 : w--;
2143 0 : mask++;
2144 0 : dst++;
2145 : }
2146 : }
2147 :
2148 : _mm_empty ();
2149 : }
2150 :
/* OVER with a solid source and an a8 mask onto an r5g6b5 destination.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then four
 * 16-bit pixels per MMX register, then a scalar tail.  When the source
 * is opaque and all four mask bytes are 0xff, the quad is stored
 * directly from the precomputed 565 pattern 'srcsrcsrcsrc'.
 */
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             pixman_image_t *         src_image,
                             pixman_image_t *         mask_image,
                             pixman_image_t *         dst_image,
                             int32_t                  src_x,
                             int32_t                  src_y,
                             int32_t                  mask_x,
                             int32_t                  mask_y,
                             int32_t                  dest_x,
                             int32_t                  dest_y,
                             int32_t                  width,
                             int32_t                  height)
{
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    uint64_t srcsrcsrcsrc, src16;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

    srca = src >> 24;
    /* A zero source leaves the destination untouched under OVER. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    /* Pack the source once to 565 and replicate it into four slots
     * for the opaque fast path. */
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    src16 = to_uint64 (tmp);

    srcsrcsrcsrc =
	(uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
	(uint64_t)src16 << 16 | (uint64_t)src16;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	/* Main: four 565 pixels at a time. */
	while (w >= 4)
	{
	    uint64_t m0, m1, m2, m3;
	    m0 = *mask;
	    m1 = *(mask + 1);
	    m2 = *(mask + 2);
	    m3 = *(mask + 3);

	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
	    {
		/* Opaque source, fully-on mask: plain store. */
		*(uint64_t *)dst = srcsrcsrcsrc;
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
		__m64 vdest;
		__m64 vm0, vm1, vm2, vm3;

		vdest = *(__m64 *)dst;

		/* Blend each 565 slot in place. */
		vm0 = to_m64 (m0);
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
					   expand565 (vdest, 0)), vdest, 0);
		vm1 = to_m64 (m1);
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
					   expand565 (vdest, 1)), vdest, 1);
		vm2 = to_m64 (m2);
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
					   expand565 (vdest, 2)), vdest, 2);
		vm3 = to_m64 (m3);
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
					   expand565 (vdest, 3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
	    mask += 4;
	    dst += 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
				       expand565 (vd, 0));
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}
2291 :
/* OVER of a non-premultiplied, reversed-channel (pixbuf) a8b8g8r8
 * source onto an r5g6b5 destination.  over_rev_non_pre() swaps the
 * channel order and premultiplies before blending.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then four
 * pixels at a time (with a direct-store shortcut when all four source
 * alphas are 0xff), then a scalar tail.
 */
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Main: four pixels at a time. */
	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		/* All four opaque: channel-swap, pack, store; no blend. */
		__m64 vdest;
		vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
		vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
		vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
		vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		__m64 vdest = *(__m64 *)dst;

		vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
		vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
		vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
		vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
2411 :
/* OVER of a non-premultiplied, reversed-channel (pixbuf) a8b8g8r8
 * source onto an a8r8g8b8 destination.  over_rev_non_pre() swaps the
 * channel order and premultiplies before blending.
 *
 * Per scanline: scalar pixels until dst is 8-byte aligned, then two
 * pixels at a time (with a channel-swap-only shortcut when both
 * source alphas are 0xff), then a scalar tail.
 */
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}

	/* Main: two pixels at a time. */
	while (w >= 2)
	{
	    uint64_t s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		/* Both opaque: channel-swap only; no blend needed. */
		d0 = invert_colors (load8888 (s0));
		d1 = invert_colors (load8888 (s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (s0 | s1)
	    {
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	/* Tail: remaining single pixel, if any. */
	while (w)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
2511 :
2512 : static void
2513 0 : mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2514 : pixman_op_t op,
2515 : pixman_image_t * src_image,
2516 : pixman_image_t * mask_image,
2517 : pixman_image_t * dst_image,
2518 : int32_t src_x,
2519 : int32_t src_y,
2520 : int32_t mask_x,
2521 : int32_t mask_y,
2522 : int32_t dest_x,
2523 : int32_t dest_y,
2524 : int32_t width,
2525 : int32_t height)
2526 : {
2527 : uint32_t src, srca;
2528 : uint16_t *dst_line;
2529 : uint32_t *mask_line;
2530 : int dst_stride, mask_stride;
2531 : __m64 vsrc, vsrca;
2532 :
2533 : CHECKPOINT ();
2534 :
2535 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2536 :
2537 0 : srca = src >> 24;
2538 0 : if (src == 0)
2539 0 : return;
2540 :
2541 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2542 0 : PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2543 :
2544 0 : vsrc = load8888 (src);
2545 0 : vsrca = expand_alpha (vsrc);
2546 :
2547 0 : while (height--)
2548 : {
2549 0 : int twidth = width;
2550 0 : uint32_t *p = (uint32_t *)mask_line;
2551 0 : uint16_t *q = (uint16_t *)dst_line;
2552 :
2553 0 : while (twidth && ((unsigned long)q & 7))
2554 : {
2555 0 : uint32_t m = *(uint32_t *)p;
2556 :
2557 0 : if (m)
2558 : {
2559 0 : uint64_t d = *q;
2560 0 : __m64 vdest = expand565 (to_m64 (d), 0);
2561 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2562 0 : *q = to_uint64 (vdest);
2563 : }
2564 :
2565 0 : twidth--;
2566 0 : p++;
2567 0 : q++;
2568 : }
2569 :
2570 0 : while (twidth >= 4)
2571 : {
2572 : uint32_t m0, m1, m2, m3;
2573 :
2574 0 : m0 = *p;
2575 0 : m1 = *(p + 1);
2576 0 : m2 = *(p + 2);
2577 0 : m3 = *(p + 3);
2578 :
2579 0 : if ((m0 | m1 | m2 | m3))
2580 : {
2581 0 : __m64 vdest = *(__m64 *)q;
2582 :
2583 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
2584 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
2585 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
2586 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
2587 :
2588 0 : *(__m64 *)q = vdest;
2589 : }
2590 0 : twidth -= 4;
2591 0 : p += 4;
2592 0 : q += 4;
2593 : }
2594 :
2595 0 : while (twidth)
2596 : {
2597 : uint32_t m;
2598 :
2599 0 : m = *(uint32_t *)p;
2600 0 : if (m)
2601 : {
2602 0 : uint64_t d = *q;
2603 0 : __m64 vdest = expand565 (to_m64 (d), 0);
2604 0 : vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2605 0 : *q = to_uint64 (vdest);
2606 : }
2607 :
2608 0 : twidth--;
2609 0 : p++;
2610 0 : q++;
2611 : }
2612 :
2613 0 : mask_line += mask_stride;
2614 0 : dst_line += dst_stride;
2615 : }
2616 :
2617 : _mm_empty ();
2618 : }
2619 :
2620 : static void
2621 0 : mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2622 : pixman_op_t op,
2623 : pixman_image_t * src_image,
2624 : pixman_image_t * mask_image,
2625 : pixman_image_t * dst_image,
2626 : int32_t src_x,
2627 : int32_t src_y,
2628 : int32_t mask_x,
2629 : int32_t mask_y,
2630 : int32_t dest_x,
2631 : int32_t dest_y,
2632 : int32_t width,
2633 : int32_t height)
2634 : {
2635 : uint8_t *dst_line, *dst;
2636 : uint8_t *mask_line, *mask;
2637 : int dst_stride, mask_stride;
2638 : int32_t w;
2639 : uint32_t src;
2640 : uint8_t sa;
2641 : __m64 vsrc, vsrca;
2642 :
2643 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2644 0 : PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2645 :
2646 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2647 :
2648 0 : sa = src >> 24;
2649 :
2650 0 : vsrc = load8888 (src);
2651 0 : vsrca = expand_alpha (vsrc);
2652 :
2653 0 : while (height--)
2654 : {
2655 0 : dst = dst_line;
2656 0 : dst_line += dst_stride;
2657 0 : mask = mask_line;
2658 0 : mask_line += mask_stride;
2659 0 : w = width;
2660 :
2661 0 : if ((((unsigned long)dst_image & 3) == 0) &&
2662 0 : (((unsigned long)src_image & 3) == 0))
2663 : {
2664 0 : while (w >= 4)
2665 : {
2666 : uint32_t m;
2667 : __m64 vmask;
2668 : __m64 vdest;
2669 :
2670 0 : m = 0;
2671 :
2672 0 : vmask = load8888 (*(uint32_t *)mask);
2673 0 : vdest = load8888 (*(uint32_t *)dst);
2674 :
2675 0 : *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
2676 :
2677 0 : dst += 4;
2678 0 : mask += 4;
2679 0 : w -= 4;
2680 : }
2681 : }
2682 :
2683 0 : while (w--)
2684 : {
2685 : uint16_t tmp;
2686 : uint8_t a;
2687 : uint32_t m, d;
2688 :
2689 0 : a = *mask++;
2690 0 : d = *dst;
2691 :
2692 0 : m = MUL_UN8 (sa, a, tmp);
2693 0 : d = MUL_UN8 (m, d, tmp);
2694 :
2695 0 : *dst++ = d;
2696 : }
2697 : }
2698 :
2699 : _mm_empty ();
2700 0 : }
2701 :
2702 : static void
2703 0 : mmx_composite_in_8_8 (pixman_implementation_t *imp,
2704 : pixman_op_t op,
2705 : pixman_image_t * src_image,
2706 : pixman_image_t * mask_image,
2707 : pixman_image_t * dst_image,
2708 : int32_t src_x,
2709 : int32_t src_y,
2710 : int32_t mask_x,
2711 : int32_t mask_y,
2712 : int32_t dest_x,
2713 : int32_t dest_y,
2714 : int32_t width,
2715 : int32_t height)
2716 : {
2717 : uint8_t *dst_line, *dst;
2718 : uint8_t *src_line, *src;
2719 : int src_stride, dst_stride;
2720 : int32_t w;
2721 :
2722 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2723 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2724 :
2725 0 : while (height--)
2726 : {
2727 0 : dst = dst_line;
2728 0 : dst_line += dst_stride;
2729 0 : src = src_line;
2730 0 : src_line += src_stride;
2731 0 : w = width;
2732 :
2733 0 : if ((((unsigned long)dst_image & 3) == 0) &&
2734 0 : (((unsigned long)src_image & 3) == 0))
2735 : {
2736 0 : while (w >= 4)
2737 : {
2738 0 : uint32_t *s = (uint32_t *)src;
2739 0 : uint32_t *d = (uint32_t *)dst;
2740 :
2741 0 : *d = store8888 (in (load8888 (*s), load8888 (*d)));
2742 :
2743 0 : w -= 4;
2744 0 : dst += 4;
2745 0 : src += 4;
2746 : }
2747 : }
2748 :
2749 0 : while (w--)
2750 : {
2751 : uint8_t s, d;
2752 : uint16_t tmp;
2753 :
2754 0 : s = *src;
2755 0 : d = *dst;
2756 :
2757 0 : *dst = MUL_UN8 (s, d, tmp);
2758 :
2759 0 : src++;
2760 0 : dst++;
2761 : }
2762 : }
2763 :
2764 : _mm_empty ();
2765 0 : }
2766 :
2767 : static void
2768 0 : mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2769 : pixman_op_t op,
2770 : pixman_image_t * src_image,
2771 : pixman_image_t * mask_image,
2772 : pixman_image_t * dst_image,
2773 : int32_t src_x,
2774 : int32_t src_y,
2775 : int32_t mask_x,
2776 : int32_t mask_y,
2777 : int32_t dest_x,
2778 : int32_t dest_y,
2779 : int32_t width,
2780 : int32_t height)
2781 : {
2782 : uint8_t *dst_line, *dst;
2783 : uint8_t *mask_line, *mask;
2784 : int dst_stride, mask_stride;
2785 : int32_t w;
2786 : uint32_t src;
2787 : uint8_t sa;
2788 : __m64 vsrc, vsrca;
2789 :
2790 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2791 0 : PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2792 :
2793 0 : src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2794 :
2795 0 : sa = src >> 24;
2796 :
2797 0 : if (src == 0)
2798 0 : return;
2799 :
2800 0 : vsrc = load8888 (src);
2801 0 : vsrca = expand_alpha (vsrc);
2802 :
2803 0 : while (height--)
2804 : {
2805 0 : dst = dst_line;
2806 0 : dst_line += dst_stride;
2807 0 : mask = mask_line;
2808 0 : mask_line += mask_stride;
2809 0 : w = width;
2810 :
2811 0 : if ((((unsigned long)mask_image & 3) == 0) &&
2812 0 : (((unsigned long)dst_image & 3) == 0))
2813 : {
2814 0 : while (w >= 4)
2815 : {
2816 0 : __m64 vmask = load8888 (*(uint32_t *)mask);
2817 0 : __m64 vdest = load8888 (*(uint32_t *)dst);
2818 :
2819 0 : *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
2820 :
2821 0 : w -= 4;
2822 0 : dst += 4;
2823 0 : mask += 4;
2824 : }
2825 : }
2826 :
2827 0 : while (w--)
2828 : {
2829 : uint16_t tmp;
2830 : uint16_t a;
2831 : uint32_t m, d;
2832 : uint32_t r;
2833 :
2834 0 : a = *mask++;
2835 0 : d = *dst;
2836 :
2837 0 : m = MUL_UN8 (sa, a, tmp);
2838 0 : r = ADD_UN8 (m, d, tmp);
2839 :
2840 0 : *dst++ = r;
2841 : }
2842 : }
2843 :
2844 : _mm_empty ();
2845 : }
2846 :
2847 : static void
2848 0 : mmx_composite_add_8_8 (pixman_implementation_t *imp,
2849 : pixman_op_t op,
2850 : pixman_image_t * src_image,
2851 : pixman_image_t * mask_image,
2852 : pixman_image_t * dst_image,
2853 : int32_t src_x,
2854 : int32_t src_y,
2855 : int32_t mask_x,
2856 : int32_t mask_y,
2857 : int32_t dest_x,
2858 : int32_t dest_y,
2859 : int32_t width,
2860 : int32_t height)
2861 : {
2862 : uint8_t *dst_line, *dst;
2863 : uint8_t *src_line, *src;
2864 : int dst_stride, src_stride;
2865 : int32_t w;
2866 : uint8_t s, d;
2867 : uint16_t t;
2868 :
2869 : CHECKPOINT ();
2870 :
2871 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2872 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2873 :
2874 0 : while (height--)
2875 : {
2876 0 : dst = dst_line;
2877 0 : dst_line += dst_stride;
2878 0 : src = src_line;
2879 0 : src_line += src_stride;
2880 0 : w = width;
2881 :
2882 0 : while (w && (unsigned long)dst & 7)
2883 : {
2884 0 : s = *src;
2885 0 : d = *dst;
2886 0 : t = d + s;
2887 0 : s = t | (0 - (t >> 8));
2888 0 : *dst = s;
2889 :
2890 0 : dst++;
2891 0 : src++;
2892 0 : w--;
2893 : }
2894 :
2895 0 : while (w >= 8)
2896 : {
2897 0 : *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
2898 0 : dst += 8;
2899 0 : src += 8;
2900 0 : w -= 8;
2901 : }
2902 :
2903 0 : while (w)
2904 : {
2905 0 : s = *src;
2906 0 : d = *dst;
2907 0 : t = d + s;
2908 0 : s = t | (0 - (t >> 8));
2909 0 : *dst = s;
2910 :
2911 0 : dst++;
2912 0 : src++;
2913 0 : w--;
2914 : }
2915 : }
2916 :
2917 : _mm_empty ();
2918 0 : }
2919 :
2920 : static void
2921 0 : mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2922 : pixman_op_t op,
2923 : pixman_image_t * src_image,
2924 : pixman_image_t * mask_image,
2925 : pixman_image_t * dst_image,
2926 : int32_t src_x,
2927 : int32_t src_y,
2928 : int32_t mask_x,
2929 : int32_t mask_y,
2930 : int32_t dest_x,
2931 : int32_t dest_y,
2932 : int32_t width,
2933 : int32_t height)
2934 : {
2935 : __m64 dst64;
2936 : uint32_t *dst_line, *dst;
2937 : uint32_t *src_line, *src;
2938 : int dst_stride, src_stride;
2939 : int32_t w;
2940 :
2941 : CHECKPOINT ();
2942 :
2943 0 : PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2944 0 : PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2945 :
2946 0 : while (height--)
2947 : {
2948 0 : dst = dst_line;
2949 0 : dst_line += dst_stride;
2950 0 : src = src_line;
2951 0 : src_line += src_stride;
2952 0 : w = width;
2953 :
2954 0 : while (w && (unsigned long)dst & 7)
2955 : {
2956 0 : *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2957 0 : _mm_cvtsi32_si64 (*dst)));
2958 0 : dst++;
2959 0 : src++;
2960 0 : w--;
2961 : }
2962 :
2963 0 : while (w >= 2)
2964 : {
2965 0 : dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
2966 0 : *(uint64_t*)dst = to_uint64 (dst64);
2967 0 : dst += 2;
2968 0 : src += 2;
2969 0 : w -= 2;
2970 : }
2971 :
2972 0 : if (w)
2973 : {
2974 0 : *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2975 0 : _mm_cvtsi32_si64 (*dst)));
2976 :
2977 : }
2978 : }
2979 :
2980 : _mm_empty ();
2981 0 : }
2982 :
/* Straight memory blit between two images of equal bpp (16 or 32).
 * Returns FALSE (caller must fall back) when the depths differ or are
 * unsupported; otherwise copies the rectangle and returns TRUE.
 *
 * Each row is copied in stages: 2-byte then 4-byte chunks to bring the
 * destination up to 8-byte alignment, 64-byte unrolled MMX moves for
 * the bulk, then 4- and 2-byte chunks for the tail.
 */
static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
                uint32_t *dst_bits,
                int src_stride,
                int dst_stride,
                int src_bpp,
                int dst_bpp,
                int src_x,
                int src_y,
                int dst_x,
                int dst_y,
                int width,
                int height)
{
    uint8_t * src_bytes;
    uint8_t * dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* Strides come in uint32_t units; convert to uint16_t units to
	 * locate the start pixel, then to bytes for the copy loops. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	/* Same conversion for 32bpp (uint32_t units, then bytes). */
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* 2-byte steps until d is 4-byte aligned (16bpp rows may start
	 * on an odd pixel).  byte_width is always a multiple of 2. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	/* 4-byte steps until d is 8-byte aligned. */
	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Bulk copy: 64 bytes per iteration through all eight MMX
	 * registers.  Only the destination is known to be 8-byte
	 * aligned; movq tolerates unaligned sources on x86. */
	while (w >= 64)
	{
#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
	    __asm__ (
	        "movq	(%1),	  %%mm0\n"
	        "movq	8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	  8(%0)\n"
	        "movq	%%mm2,	  16(%0)\n"
	        "movq	%%mm3,	  24(%0)\n"
	        "movq	%%mm4,	  32(%0)\n"
	        "movq	%%mm5,	  40(%0)\n"
	        "movq	%%mm6,	  48(%0)\n"
	        "movq	%%mm7,	  56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    /* Portable fallback for compilers without GCC-style asm. */
	    __m64 v0 = *(__m64 *)(s + 0);
	    __m64 v1 = *(__m64 *)(s + 8);
	    __m64 v2 = *(__m64 *)(s + 16);
	    __m64 v3 = *(__m64 *)(s + 24);
	    __m64 v4 = *(__m64 *)(s + 32);
	    __m64 v5 = *(__m64 *)(s + 40);
	    __m64 v6 = *(__m64 *)(s + 48);
	    __m64 v7 = *(__m64 *)(s + 56);
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	/* Tail: 4-byte chunks, then at most one 2-byte chunk. */
	while (w >= 4)
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();

    return TRUE;
}
3125 :
/* SRC copy fast path: a thin wrapper around pixman_blt_mmx().
 * The return value is deliberately ignored: the fast-path table only
 * selects this function for source/destination format pairs with equal
 * bpp (16 or 32), so the blit cannot fail here.
 */
static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_op_t op,
                         pixman_image_t * src_image,
                         pixman_image_t * mask_image,
                         pixman_image_t * dst_image,
                         int32_t src_x,
                         int32_t src_y,
                         int32_t mask_x,
                         int32_t mask_y,
                         int32_t dest_x,
                         int32_t dest_y,
                         int32_t width,
                         int32_t height)
{
    pixman_blt_mmx (src_image->bits.bits,
                    dst_image->bits.bits,
                    src_image->bits.rowstride,
                    dst_image->bits.rowstride,
                    PIXMAN_FORMAT_BPP (src_image->bits.format),
                    PIXMAN_FORMAT_BPP (dst_image->bits.format),
                    src_x, src_y, dest_x, dest_y, width, height);
}
3149 :
#if 0
/* OVER of an x8r8g8b8 source through an a8 mask onto an 8888
 * destination.  Intentionally compiled out: per the note at the
 * fast-path table below, it was measured to be no faster than the
 * generic code.  Kept for reference.
 */
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_op_t op,
                                pixman_image_t * src_image,
                                pixman_image_t * mask_image,
                                pixman_image_t * dst_image,
                                int32_t src_x,
                                int32_t src_y,
                                int32_t mask_x,
                                int32_t mask_y,
                                int32_t dest_x,
                                int32_t dest_y,
                                int32_t width,
                                int32_t height)
{
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	while (w--)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		/* x888 source: force the alpha channel to opaque. */
		__m64 s = load8888 (*src | 0xff000000);

		if (m == 0xff)
		{
		    /* Fully opaque mask: plain copy. */
		    *dst = store8888 (s);
		}
		else
		{
		    __m64 sa = expand_alpha (s);
		    __m64 vm = expand_alpha_rev (to_m64 (m));
		    __m64 vdest = in_over (s, sa, vm, load8888 (*dst));

		    *dst = store8888 (vdest);
		}
	    }

	    mask++;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
#endif
3218 :
3219 : static const pixman_fast_path_t mmx_fast_paths[] =
3220 : {
3221 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
3222 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
3223 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
3224 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
3225 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
3226 : PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
3227 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3228 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3229 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
3230 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3231 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3232 : PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
3233 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3234 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3235 : PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
3236 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3237 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3238 : PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
3239 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
3240 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
3241 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
3242 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
3243 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
3244 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
3245 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
3246 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
3247 : #if 0
3248 : /* FIXME: This code is commented out since it's apparently
3249 : * not actually faster than the generic code.
3250 : */
3251 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
3252 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3253 : PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
3254 : PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3255 : #endif
3256 : PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
3257 : PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
3258 : PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
3259 : PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3260 : PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3261 :
3262 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
3263 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
3264 : PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
3265 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
3266 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
3267 : PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
3268 :
3269 : PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
3270 : PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
3271 : PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
3272 : PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
3273 :
3274 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
3275 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
3276 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
3277 : PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
3278 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
3279 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
3280 : PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3281 : PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3282 : PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3283 : PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3284 : PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
3285 : PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
3286 :
3287 : PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
3288 : PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
3289 :
3290 : { PIXMAN_OP_NONE },
3291 : };
3292 :
3293 : static pixman_bool_t
3294 0 : mmx_blt (pixman_implementation_t *imp,
3295 : uint32_t * src_bits,
3296 : uint32_t * dst_bits,
3297 : int src_stride,
3298 : int dst_stride,
3299 : int src_bpp,
3300 : int dst_bpp,
3301 : int src_x,
3302 : int src_y,
3303 : int dst_x,
3304 : int dst_y,
3305 : int width,
3306 : int height)
3307 : {
3308 0 : if (!pixman_blt_mmx (
3309 : src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3310 : src_x, src_y, dst_x, dst_y, width, height))
3311 :
3312 : {
3313 0 : return _pixman_implementation_blt (
3314 : imp->delegate,
3315 : src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3316 : src_x, src_y, dst_x, dst_y, width, height);
3317 : }
3318 :
3319 0 : return TRUE;
3320 : }
3321 :
3322 : static pixman_bool_t
3323 0 : mmx_fill (pixman_implementation_t *imp,
3324 : uint32_t * bits,
3325 : int stride,
3326 : int bpp,
3327 : int x,
3328 : int y,
3329 : int width,
3330 : int height,
3331 : uint32_t xor)
3332 : {
3333 0 : if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3334 : {
3335 0 : return _pixman_implementation_fill (
3336 : imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3337 : }
3338 :
3339 0 : return TRUE;
3340 : }
3341 :
/* Create the MMX implementation object: build it on top of the given
 * fallback, then install the MMX combiners, blit and fill entry points.
 * Operations not overridden here fall through to the fast-path table
 * or the delegate.
 */
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    /* Unified-alpha (non component-alpha) combiners. */
    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    /* Component-alpha combiners. */
    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    return imp;
}
3376 :
3377 : #endif /* USE_MMX */
|