LCOV - code coverage report
Current view: directory - gfx/cairo/libpixman/src - pixman-mmx.c (source / functions) Found Hit Coverage
Test: app.info Lines: 1254 27 2.2 %
Date: 2012-06-02 Functions: 46 1 2.2 %

       1                 : /*
       2                 :  * Copyright © 2004, 2005 Red Hat, Inc.
       3                 :  * Copyright © 2004 Nicholas Miell
       4                 :  * Copyright © 2005 Trolltech AS
       5                 :  *
       6                 :  * Permission to use, copy, modify, distribute, and sell this software and its
       7                 :  * documentation for any purpose is hereby granted without fee, provided that
       8                 :  * the above copyright notice appear in all copies and that both that
       9                 :  * copyright notice and this permission notice appear in supporting
      10                 :  * documentation, and that the name of Red Hat not be used in advertising or
      11                 :  * publicity pertaining to distribution of the software without specific,
      12                 :  * written prior permission.  Red Hat makes no representations about the
      13                 :  * suitability of this software for any purpose.  It is provided "as is"
      14                 :  * without express or implied warranty.
      15                 :  *
      16                 :  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
      17                 :  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      18                 :  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
      19                 :  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      20                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
      21                 :  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
      22                 :  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
      23                 :  * SOFTWARE.
      24                 :  *
      25                 :  * Author:  Søren Sandmann (sandmann@redhat.com)
      26                 :  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
      27                 :  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
      28                 :  *
      29                 :  * Based on work by Owen Taylor
      30                 :  */
      31                 : 
      32                 : #ifdef HAVE_CONFIG_H
      33                 : #include <config.h>
      34                 : #endif
      35                 : 
      36                 : #ifdef USE_MMX
      37                 : 
      38                 : #include <mmintrin.h>
      39                 : #include "pixman-private.h"
      40                 : #include "pixman-combine32.h"
      41                 : 
      42                 : #define no_vERBOSE
      43                 : 
      44                 : #ifdef VERBOSE
      45                 : #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
      46                 : #else
      47                 : #define CHECKPOINT()
      48                 : #endif
      49                 : 
      50                 : /* Notes about writing mmx code
      51                 :  *
      52                 :  * give memory operands as the second operand. If you give it as the
      53                 :  * first, gcc will first load it into a register, then use that
      54                 :  * register
      55                 :  *
      56                 :  *   ie. use
      57                 :  *
      58                 :  *         _mm_mullo_pi16 (x, mmx_constant);
      59                 :  *
      60                 :  *   not
      61                 :  *
      62                 :  *         _mm_mullo_pi16 (mmx_constant, x);
      63                 :  *
      64                 :  * Also try to minimize dependencies. i.e. when you need a value, try
      65                 :  * to calculate it from a value that was calculated as early as
      66                 :  * possible.
      67                 :  */
      68                 : 
      69                 : /* --------------- MMX primitives ------------------------------------- */
      70                 : 
      71                 : #ifdef __GNUC__
      72                 : typedef uint64_t mmxdatafield;
      73                 : #else
      74                 : typedef __m64 mmxdatafield;
      75                 : /* If __m64 is defined as a struct or union, define M64_MEMBER to be the
      76                 :    name of the member used to access the data */
      77                 : # ifdef _MSC_VER
      78                 : #  define M64_MEMBER m64_u64
      79                 : # elif defined(__SUNPRO_C)
      80                 : #  define M64_MEMBER l_
      81                 : # endif
      82                 : #endif
      83                 : 
      84                 : typedef struct
      85                 : {
      86                 :     mmxdatafield mmx_4x00ff;
      87                 :     mmxdatafield mmx_4x0080;
      88                 :     mmxdatafield mmx_565_rgb;
      89                 :     mmxdatafield mmx_565_unpack_multiplier;
      90                 :     mmxdatafield mmx_565_r;
      91                 :     mmxdatafield mmx_565_g;
      92                 :     mmxdatafield mmx_565_b;
      93                 :     mmxdatafield mmx_mask_0;
      94                 :     mmxdatafield mmx_mask_1;
      95                 :     mmxdatafield mmx_mask_2;
      96                 :     mmxdatafield mmx_mask_3;
      97                 :     mmxdatafield mmx_full_alpha;
      98                 :     mmxdatafield mmx_ffff0000ffff0000;
      99                 :     mmxdatafield mmx_0000ffff00000000;
     100                 :     mmxdatafield mmx_000000000000ffff;
     101                 : } mmx_data_t;
     102                 : 
     103                 : #if defined(_MSC_VER)
     104                 : # define MMXDATA_INIT(field, val) { val ## UI64 }
     105                 : #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
     106                 : # define MMXDATA_INIT(field, val) field =   { val ## ULL }
     107                 : #else                           /* __m64 is an integral type */
     108                 : # define MMXDATA_INIT(field, val) field =   val ## ULL
     109                 : #endif
     110                 : 
     111                 : static const mmx_data_t c =
     112                 : {
     113                 :     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
     114                 :     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
     115                 :     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
     116                 :     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
     117                 :     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
     118                 :     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
     119                 :     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
     120                 :     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
     121                 :     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
     122                 :     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
     123                 :     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
     124                 :     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
     125                 :     MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),
     126                 :     MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),
     127                 :     MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),
     128                 : };
     129                 : 
     130                 : #ifdef __GNUC__
     131                 : #    ifdef __ICC
     132                 : #        define MC(x) to_m64 (c.mmx_ ## x)
     133                 : #    else
     134                 : #        define MC(x) ((__m64)c.mmx_ ## x)
     135                 : #    endif
     136                 : #else
     137                 : #    define MC(x) c.mmx_ ## x
     138                 : #endif
     139                 : 
     140                 : static force_inline __m64
     141                 : to_m64 (uint64_t x)
     142                 : {
     143                 : #ifdef __ICC
     144                 :     return _mm_cvtsi64_m64 (x);
     145                 : #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
     146                 :     __m64 res;
     147                 : 
     148                 :     res.M64_MEMBER = x;
     149                 :     return res;
     150                 : #else                           /* __m64 is an integral type */
     151               0 :     return (__m64)x;
     152                 : #endif
     153                 : }
     154                 : 
     155                 : static force_inline uint64_t
     156                 : to_uint64 (__m64 x)
     157                 : {
     158                 : #ifdef __ICC
     159                 :     return _mm_cvtm64_si64 (x);
     160                 : #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
     161                 :     uint64_t res = x.M64_MEMBER;
     162                 :     return res;
     163                 : #else                           /* __m64 is an integral type */
     164               0 :     return (uint64_t)x;
     165                 : #endif
     166                 : }
     167                 : 
     168                 : static force_inline __m64
     169                 : shift (__m64 v,
     170                 :        int   s)
     171                 : {
     172               0 :     if (s > 0)
     173                 :         return _mm_slli_si64 (v, s);
     174               0 :     else if (s < 0)
     175               0 :         return _mm_srli_si64 (v, -s);
     176                 :     else
     177               0 :         return v;
     178                 : }
     179                 : 
     180                 : static force_inline __m64
     181                 : negate (__m64 mask)
     182                 : {
     183               0 :     return _mm_xor_si64 (mask, MC (4x00ff));
     184                 : }
     185                 : 
     186                 : static force_inline __m64
     187                 : pix_multiply (__m64 a, __m64 b)
     188                 : {
     189                 :     __m64 res;
     190                 : 
     191               0 :     res = _mm_mullo_pi16 (a, b);
     192               0 :     res = _mm_adds_pu16 (res, MC (4x0080));
     193               0 :     res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
     194               0 :     res = _mm_srli_pi16 (res, 8);
     195                 : 
     196               0 :     return res;
     197                 : }
     198                 : 
     199                 : static force_inline __m64
     200                 : pix_add (__m64 a, __m64 b)
     201                 : {
     202                 :     return _mm_adds_pu8 (a, b);
     203                 : }
     204                 : 
     205                 : static force_inline __m64
     206                 : expand_alpha (__m64 pixel)
     207                 : {
     208                 :     __m64 t1, t2;
     209                 : 
     210               0 :     t1 = shift (pixel, -48);
     211               0 :     t2 = shift (t1, 16);
     212               0 :     t1 = _mm_or_si64 (t1, t2);
     213               0 :     t2 = shift (t1, 32);
     214               0 :     t1 = _mm_or_si64 (t1, t2);
     215                 : 
     216               0 :     return t1;
     217                 : }
     218                 : 
     219                 : static force_inline __m64
     220                 : expand_alpha_rev (__m64 pixel)
     221                 : {
     222                 :     __m64 t1, t2;
     223                 : 
     224                 :     /* move alpha to low 16 bits and zero the rest */
     225               0 :     t1 = shift (pixel,  48);
     226               0 :     t1 = shift (t1, -48);
     227                 : 
     228               0 :     t2 = shift (t1, 16);
     229               0 :     t1 = _mm_or_si64 (t1, t2);
     230               0 :     t2 = shift (t1, 32);
     231               0 :     t1 = _mm_or_si64 (t1, t2);
     232                 : 
     233               0 :     return t1;
     234                 : }
     235                 : 
     236                 : static force_inline __m64
     237                 : invert_colors (__m64 pixel)
     238                 : {
     239                 :     __m64 x, y, z;
     240                 : 
     241               0 :     x = y = z = pixel;
     242                 : 
     243               0 :     x = _mm_and_si64 (x, MC (ffff0000ffff0000));
     244               0 :     y = _mm_and_si64 (y, MC (000000000000ffff));
     245               0 :     z = _mm_and_si64 (z, MC (0000ffff00000000));
     246                 : 
     247               0 :     y = shift (y, 32);
     248               0 :     z = shift (z, -32);
     249                 : 
     250               0 :     x = _mm_or_si64 (x, y);
     251               0 :     x = _mm_or_si64 (x, z);
     252                 : 
     253               0 :     return x;
     254                 : }
     255                 : 
     256                 : static force_inline __m64
     257                 : over (__m64 src,
     258                 :       __m64 srca,
     259                 :       __m64 dest)
     260                 : {
     261                 :     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
     262                 : }
     263                 : 
     264                 : static force_inline __m64
     265                 : over_rev_non_pre (__m64 src, __m64 dest)
     266                 : {
     267               0 :     __m64 srca = expand_alpha (src);
     268               0 :     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
     269                 : 
     270               0 :     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
     271                 : }
     272                 : 
     273                 : static force_inline __m64
     274                 : in (__m64 src, __m64 mask)
     275                 : {
     276                 :     return pix_multiply (src, mask);
     277                 : }
     278                 : 
     279                 : static force_inline __m64
     280                 : in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
     281                 : {
     282                 :     src = _mm_or_si64 (src, MC (full_alpha));
     283                 : 
     284                 :     return over (in (src, mask), mask, dest);
     285                 : }
     286                 : 
     287                 : #ifndef _MSC_VER
     288                 : static force_inline __m64
     289                 : in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
     290                 : {
     291                 :     return over (in (src, mask), pix_multiply (srca, mask), dest);
     292                 : }
     293                 : 
     294                 : #else
     295                 : 
     296                 : #define in_over(src, srca, mask, dest)                                  \
     297                 :     over (in (src, mask), pix_multiply (srca, mask), dest)
     298                 : 
     299                 : #endif
     300                 : 
     301                 : static force_inline __m64
     302                 : load8888 (uint32_t v)
     303                 : {
     304               0 :     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
     305                 : }
     306                 : 
     307                 : static force_inline __m64
     308                 : pack8888 (__m64 lo, __m64 hi)
     309                 : {
     310                 :     return _mm_packs_pu16 (lo, hi);
     311                 : }
     312                 : 
     313                 : static force_inline uint32_t
     314                 : store8888 (__m64 v)
     315                 : {
     316               0 :     return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
     317                 : }
     318                 : 
     319                 : /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
     320                 :  *
     321                 :  *    00RR00GG00BB
     322                 :  *
     323                 :  * --- Expanding 565 in the low word ---
     324                 :  *
     325                 :  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
     326                 :  * m = m & (01f0003f001f);
     327                 :  * m = m * (008404100840);
     328                 :  * m = m >> 8;
     329                 :  *
     330                 :  * Note the trick here - the top word is shifted by another nibble to
     331                 :  * avoid it bumping into the middle word
     332                 :  */
     333                 : static force_inline __m64
     334                 : expand565 (__m64 pixel, int pos)
     335                 : {
     336               0 :     __m64 p = pixel;
     337                 :     __m64 t1, t2;
     338                 : 
     339                 :     /* move pixel to low 16 bit and zero the rest */
     340               0 :     p = shift (shift (p, (3 - pos) * 16), -48);
     341                 : 
     342               0 :     t1 = shift (p, 36 - 11);
     343               0 :     t2 = shift (p, 16 - 5);
     344                 : 
     345               0 :     p = _mm_or_si64 (t1, p);
     346               0 :     p = _mm_or_si64 (t2, p);
     347               0 :     p = _mm_and_si64 (p, MC (565_rgb));
     348                 : 
     349               0 :     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
     350                 :     return _mm_srli_pi16 (pixel, 8);
     351                 : }
     352                 : 
     353                 : static force_inline __m64
     354                 : expand8888 (__m64 in, int pos)
     355                 : {
     356               0 :     if (pos == 0)
     357                 :         return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
     358                 :     else
     359                 :         return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
     360                 : }
     361                 : 
     362                 : static force_inline __m64
     363                 : expandx888 (__m64 in, int pos)
     364                 : {
     365               0 :     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
     366                 : }
     367                 : 
     368                 : static force_inline __m64
     369                 : pack_565 (__m64 pixel, __m64 target, int pos)
     370                 : {
     371               0 :     __m64 p = pixel;
     372               0 :     __m64 t = target;
     373                 :     __m64 r, g, b;
     374                 : 
     375               0 :     r = _mm_and_si64 (p, MC (565_r));
     376               0 :     g = _mm_and_si64 (p, MC (565_g));
     377               0 :     b = _mm_and_si64 (p, MC (565_b));
     378                 : 
     379               0 :     r = shift (r, -(32 - 8) + pos * 16);
     380               0 :     g = shift (g, -(16 - 3) + pos * 16);
     381               0 :     b = shift (b, -(0  + 3) + pos * 16);
     382                 : 
     383               0 :     if (pos == 0)
     384               0 :         t = _mm_and_si64 (t, MC (mask_0));
     385               0 :     else if (pos == 1)
     386               0 :         t = _mm_and_si64 (t, MC (mask_1));
     387               0 :     else if (pos == 2)
     388               0 :         t = _mm_and_si64 (t, MC (mask_2));
     389               0 :     else if (pos == 3)
     390               0 :         t = _mm_and_si64 (t, MC (mask_3));
     391                 : 
     392               0 :     p = _mm_or_si64 (r, t);
     393               0 :     p = _mm_or_si64 (g, p);
     394                 : 
     395               0 :     return _mm_or_si64 (b, p);
     396                 : }
     397                 : 
     398                 : #ifndef _MSC_VER
     399                 : 
     400                 : static force_inline __m64
     401                 : pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
     402                 : {
     403                 :     x = pix_multiply (x, a);
     404                 :     y = pix_multiply (y, b);
     405                 : 
     406                 :     return pix_add (x, y);
     407                 : }
     408                 : 
     409                 : #else
     410                 : 
     411                 : #define pix_add_mul(x, a, y, b)  \
     412                 :     ( x = pix_multiply (x, a),   \
     413                 :       y = pix_multiply (y, a),   \
     414                 :       pix_add (x, y) )
     415                 : 
     416                 : #endif
     417                 : 
     418                 : /* --------------- MMX code paths for fbcompose.c --------------------- */
     419                 : 
     420                 : static force_inline uint32_t
     421                 : combine (const uint32_t *src, const uint32_t *mask)
     422                 : {
     423               0 :     uint32_t ssrc = *src;
     424                 : 
     425               0 :     if (mask)
     426                 :     {
     427               0 :         __m64 m = load8888 (*mask);
     428               0 :         __m64 s = load8888 (ssrc);
     429                 : 
     430               0 :         m = expand_alpha (m);
     431               0 :         s = pix_multiply (s, m);
     432                 : 
     433               0 :         ssrc = store8888 (s);
     434                 :     }
     435                 : 
     436               0 :     return ssrc;
     437                 : }
     438                 : 
     439                 : static void
     440               0 : mmx_combine_over_u (pixman_implementation_t *imp,
     441                 :                     pixman_op_t              op,
     442                 :                     uint32_t *               dest,
     443                 :                     const uint32_t *         src,
     444                 :                     const uint32_t *         mask,
     445                 :                     int                      width)
     446                 : {
     447               0 :     const uint32_t *end = dest + width;
     448                 : 
     449               0 :     while (dest < end)
     450                 :     {
     451                 :         uint32_t ssrc = combine (src, mask);
     452               0 :         uint32_t a = ssrc >> 24;
     453                 : 
     454               0 :         if (a == 0xff)
     455                 :         {
     456               0 :             *dest = ssrc;
     457                 :         }
     458               0 :         else if (ssrc)
     459                 :         {
     460                 :             __m64 s, sa;
     461               0 :             s = load8888 (ssrc);
     462               0 :             sa = expand_alpha (s);
     463               0 :             *dest = store8888 (over (s, sa, load8888 (*dest)));
     464                 :         }
     465                 : 
     466               0 :         ++dest;
     467               0 :         ++src;
     468               0 :         if (mask)
     469               0 :             ++mask;
     470                 :     }
     471                 :     _mm_empty ();
     472               0 : }
     473                 : 
     474                 : static void
     475               0 : mmx_combine_over_reverse_u (pixman_implementation_t *imp,
     476                 :                             pixman_op_t              op,
     477                 :                             uint32_t *               dest,
     478                 :                             const uint32_t *         src,
     479                 :                             const uint32_t *         mask,
     480                 :                             int                      width)
     481                 : {
     482               0 :     const uint32_t *end = dest + width;
     483                 : 
     484               0 :     while (dest < end)
     485                 :     {
     486                 :         __m64 d, da;
     487                 :         uint32_t s = combine (src, mask);
     488                 : 
     489               0 :         d = load8888 (*dest);
     490               0 :         da = expand_alpha (d);
     491               0 :         *dest = store8888 (over (d, da, load8888 (s)));
     492                 : 
     493               0 :         ++dest;
     494               0 :         ++src;
     495               0 :         if (mask)
     496               0 :             mask++;
     497                 :     }
     498                 :     _mm_empty ();
     499               0 : }
     500                 : 
     501                 : static void
     502               0 : mmx_combine_in_u (pixman_implementation_t *imp,
     503                 :                   pixman_op_t              op,
     504                 :                   uint32_t *               dest,
     505                 :                   const uint32_t *         src,
     506                 :                   const uint32_t *         mask,
     507                 :                   int                      width)
     508                 : {
     509               0 :     const uint32_t *end = dest + width;
     510                 : 
     511               0 :     while (dest < end)
     512                 :     {
     513                 :         __m64 x, a;
     514                 : 
     515               0 :         x = load8888 (combine (src, mask));
     516               0 :         a = load8888 (*dest);
     517               0 :         a = expand_alpha (a);
     518               0 :         x = pix_multiply (x, a);
     519                 : 
     520               0 :         *dest = store8888 (x);
     521                 : 
     522               0 :         ++dest;
     523               0 :         ++src;
     524               0 :         if (mask)
     525               0 :             mask++;
     526                 :     }
     527                 :     _mm_empty ();
     528               0 : }
     529                 : 
     530                 : static void
     531               0 : mmx_combine_in_reverse_u (pixman_implementation_t *imp,
     532                 :                           pixman_op_t              op,
     533                 :                           uint32_t *               dest,
     534                 :                           const uint32_t *         src,
     535                 :                           const uint32_t *         mask,
     536                 :                           int                      width)
     537                 : {
     538               0 :     const uint32_t *end = dest + width;
     539                 : 
     540               0 :     while (dest < end)
     541                 :     {
     542                 :         __m64 x, a;
     543                 : 
     544               0 :         x = load8888 (*dest);
     545               0 :         a = load8888 (combine (src, mask));
     546               0 :         a = expand_alpha (a);
     547               0 :         x = pix_multiply (x, a);
     548               0 :         *dest = store8888 (x);
     549                 : 
     550               0 :         ++dest;
     551               0 :         ++src;
     552               0 :         if (mask)
     553               0 :             mask++;
     554                 :     }
     555                 :     _mm_empty ();
     556               0 : }
     557                 : 
     558                 : static void
     559               0 : mmx_combine_out_u (pixman_implementation_t *imp,
     560                 :                    pixman_op_t              op,
     561                 :                    uint32_t *               dest,
     562                 :                    const uint32_t *         src,
     563                 :                    const uint32_t *         mask,
     564                 :                    int                      width)
     565                 : {
     566               0 :     const uint32_t *end = dest + width;
     567                 : 
     568               0 :     while (dest < end)
     569                 :     {
     570                 :         __m64 x, a;
     571                 : 
     572               0 :         x = load8888 (combine (src, mask));
     573               0 :         a = load8888 (*dest);
     574               0 :         a = expand_alpha (a);
     575               0 :         a = negate (a);
     576               0 :         x = pix_multiply (x, a);
     577               0 :         *dest = store8888 (x);
     578                 : 
     579               0 :         ++dest;
     580               0 :         ++src;
     581               0 :         if (mask)
     582               0 :             mask++;
     583                 :     }
     584                 :     _mm_empty ();
     585               0 : }
     586                 : 
     587                 : static void
     588               0 : mmx_combine_out_reverse_u (pixman_implementation_t *imp,
     589                 :                            pixman_op_t              op,
     590                 :                            uint32_t *               dest,
     591                 :                            const uint32_t *         src,
     592                 :                            const uint32_t *         mask,
     593                 :                            int                      width)
     594                 : {
     595               0 :     const uint32_t *end = dest + width;
     596                 : 
     597               0 :     while (dest < end)
     598                 :     {
     599                 :         __m64 x, a;
     600                 : 
     601               0 :         x = load8888 (*dest);
     602               0 :         a = load8888 (combine (src, mask));
     603               0 :         a = expand_alpha (a);
     604               0 :         a = negate (a);
     605               0 :         x = pix_multiply (x, a);
     606                 : 
     607               0 :         *dest = store8888 (x);
     608                 : 
     609               0 :         ++dest;
     610               0 :         ++src;
     611               0 :         if (mask)
     612               0 :             mask++;
     613                 :     }
     614                 :     _mm_empty ();
     615               0 : }
     616                 : 
     617                 : static void
     618               0 : mmx_combine_atop_u (pixman_implementation_t *imp,
     619                 :                     pixman_op_t              op,
     620                 :                     uint32_t *               dest,
     621                 :                     const uint32_t *         src,
     622                 :                     const uint32_t *         mask,
     623                 :                     int                      width)
     624                 : {
     625               0 :     const uint32_t *end = dest + width;
     626                 : 
     627               0 :     while (dest < end)
     628                 :     {
     629                 :         __m64 s, da, d, sia;
     630                 : 
     631               0 :         s = load8888 (combine (src, mask));
     632               0 :         d = load8888 (*dest);
     633               0 :         sia = expand_alpha (s);
     634               0 :         sia = negate (sia);
     635               0 :         da = expand_alpha (d);
     636               0 :         s = pix_add_mul (s, da, d, sia);
     637               0 :         *dest = store8888 (s);
     638                 : 
     639               0 :         ++dest;
     640               0 :         ++src;
     641               0 :         if (mask)
     642               0 :             mask++;
     643                 :     }
     644                 :     _mm_empty ();
     645               0 : }
     646                 : 
     647                 : static void
     648               0 : mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
     649                 :                             pixman_op_t              op,
     650                 :                             uint32_t *               dest,
     651                 :                             const uint32_t *         src,
     652                 :                             const uint32_t *         mask,
     653                 :                             int                      width)
     654                 : {
     655                 :     const uint32_t *end;
     656                 : 
     657               0 :     end = dest + width;
     658                 : 
     659               0 :     while (dest < end)
     660                 :     {
     661                 :         __m64 s, dia, d, sa;
     662                 : 
     663               0 :         s = load8888 (combine (src, mask));
     664               0 :         d = load8888 (*dest);
     665               0 :         sa = expand_alpha (s);
     666               0 :         dia = expand_alpha (d);
     667               0 :         dia = negate (dia);
     668               0 :         s = pix_add_mul (s, dia, d, sa);
     669               0 :         *dest = store8888 (s);
     670                 : 
     671               0 :         ++dest;
     672               0 :         ++src;
     673               0 :         if (mask)
     674               0 :             mask++;
     675                 :     }
     676                 :     _mm_empty ();
     677               0 : }
     678                 : 
     679                 : static void
     680               0 : mmx_combine_xor_u (pixman_implementation_t *imp,
     681                 :                    pixman_op_t              op,
     682                 :                    uint32_t *               dest,
     683                 :                    const uint32_t *         src,
     684                 :                    const uint32_t *         mask,
     685                 :                    int                      width)
     686                 : {
     687               0 :     const uint32_t *end = dest + width;
     688                 : 
     689               0 :     while (dest < end)
     690                 :     {
     691                 :         __m64 s, dia, d, sia;
     692                 : 
     693               0 :         s = load8888 (combine (src, mask));
     694               0 :         d = load8888 (*dest);
     695               0 :         sia = expand_alpha (s);
     696               0 :         dia = expand_alpha (d);
     697               0 :         sia = negate (sia);
     698               0 :         dia = negate (dia);
     699               0 :         s = pix_add_mul (s, dia, d, sia);
     700               0 :         *dest = store8888 (s);
     701                 : 
     702               0 :         ++dest;
     703               0 :         ++src;
     704               0 :         if (mask)
     705               0 :             mask++;
     706                 :     }
     707                 :     _mm_empty ();
     708               0 : }
     709                 : 
     710                 : static void
     711               0 : mmx_combine_add_u (pixman_implementation_t *imp,
     712                 :                    pixman_op_t              op,
     713                 :                    uint32_t *               dest,
     714                 :                    const uint32_t *         src,
     715                 :                    const uint32_t *         mask,
     716                 :                    int                      width)
     717                 : {
     718               0 :     const uint32_t *end = dest + width;
     719                 : 
     720               0 :     while (dest < end)
     721                 :     {
     722                 :         __m64 s, d;
     723                 : 
     724               0 :         s = load8888 (combine (src, mask));
     725               0 :         d = load8888 (*dest);
     726               0 :         s = pix_add (s, d);
     727               0 :         *dest = store8888 (s);
     728                 : 
     729               0 :         ++dest;
     730               0 :         ++src;
     731               0 :         if (mask)
     732               0 :             mask++;
     733                 :     }
     734                 :     _mm_empty ();
     735               0 : }
     736                 : 
     737                 : static void
     738               0 : mmx_combine_saturate_u (pixman_implementation_t *imp,
     739                 :                         pixman_op_t              op,
     740                 :                         uint32_t *               dest,
     741                 :                         const uint32_t *         src,
     742                 :                         const uint32_t *         mask,
     743                 :                         int                      width)
     744                 : {
     745               0 :     const uint32_t *end = dest + width;
     746                 : 
     747               0 :     while (dest < end)
     748                 :     {
     749                 :         uint32_t s = combine (src, mask);
     750               0 :         uint32_t d = *dest;
     751               0 :         __m64 ms = load8888 (s);
     752               0 :         __m64 md = load8888 (d);
     753               0 :         uint32_t sa = s >> 24;
     754               0 :         uint32_t da = ~d >> 24;
     755                 : 
     756               0 :         if (sa > da)
     757                 :         {
     758               0 :             __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
     759               0 :             msa = expand_alpha (msa);
     760               0 :             ms = pix_multiply (ms, msa);
     761                 :         }
     762                 : 
     763               0 :         md = pix_add (md, ms);
     764               0 :         *dest = store8888 (md);
     765                 : 
     766               0 :         ++src;
     767               0 :         ++dest;
     768               0 :         if (mask)
     769               0 :             mask++;
     770                 :     }
     771                 :     _mm_empty ();
     772               0 : }
     773                 : 
     774                 : static void
     775               0 : mmx_combine_src_ca (pixman_implementation_t *imp,
     776                 :                     pixman_op_t              op,
     777                 :                     uint32_t *               dest,
     778                 :                     const uint32_t *         src,
     779                 :                     const uint32_t *         mask,
     780                 :                     int                      width)
     781                 : {
     782               0 :     const uint32_t *end = src + width;
     783                 : 
     784               0 :     while (src < end)
     785                 :     {
     786               0 :         __m64 a = load8888 (*mask);
     787               0 :         __m64 s = load8888 (*src);
     788                 : 
     789               0 :         s = pix_multiply (s, a);
     790               0 :         *dest = store8888 (s);
     791                 : 
     792               0 :         ++src;
     793               0 :         ++mask;
     794               0 :         ++dest;
     795                 :     }
     796                 :     _mm_empty ();
     797               0 : }
     798                 : 
     799                 : static void
     800               0 : mmx_combine_over_ca (pixman_implementation_t *imp,
     801                 :                      pixman_op_t              op,
     802                 :                      uint32_t *               dest,
     803                 :                      const uint32_t *         src,
     804                 :                      const uint32_t *         mask,
     805                 :                      int                      width)
     806                 : {
     807               0 :     const uint32_t *end = src + width;
     808                 : 
     809               0 :     while (src < end)
     810                 :     {
     811               0 :         __m64 a = load8888 (*mask);
     812               0 :         __m64 s = load8888 (*src);
     813               0 :         __m64 d = load8888 (*dest);
     814               0 :         __m64 sa = expand_alpha (s);
     815                 : 
     816               0 :         *dest = store8888 (in_over (s, sa, a, d));
     817                 : 
     818               0 :         ++src;
     819               0 :         ++dest;
     820               0 :         ++mask;
     821                 :     }
     822                 :     _mm_empty ();
     823               0 : }
     824                 : 
     825                 : static void
     826               0 : mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
     827                 :                              pixman_op_t              op,
     828                 :                              uint32_t *               dest,
     829                 :                              const uint32_t *         src,
     830                 :                              const uint32_t *         mask,
     831                 :                              int                      width)
     832                 : {
     833               0 :     const uint32_t *end = src + width;
     834                 : 
     835               0 :     while (src < end)
     836                 :     {
     837               0 :         __m64 a = load8888 (*mask);
     838               0 :         __m64 s = load8888 (*src);
     839               0 :         __m64 d = load8888 (*dest);
     840               0 :         __m64 da = expand_alpha (d);
     841                 : 
     842               0 :         *dest = store8888 (over (d, da, in (s, a)));
     843                 : 
     844               0 :         ++src;
     845               0 :         ++dest;
     846               0 :         ++mask;
     847                 :     }
     848                 :     _mm_empty ();
     849               0 : }
     850                 : 
     851                 : static void
     852               0 : mmx_combine_in_ca (pixman_implementation_t *imp,
     853                 :                    pixman_op_t              op,
     854                 :                    uint32_t *               dest,
     855                 :                    const uint32_t *         src,
     856                 :                    const uint32_t *         mask,
     857                 :                    int                      width)
     858                 : {
     859               0 :     const uint32_t *end = src + width;
     860                 : 
     861               0 :     while (src < end)
     862                 :     {
     863               0 :         __m64 a = load8888 (*mask);
     864               0 :         __m64 s = load8888 (*src);
     865               0 :         __m64 d = load8888 (*dest);
     866               0 :         __m64 da = expand_alpha (d);
     867                 : 
     868               0 :         s = pix_multiply (s, a);
     869               0 :         s = pix_multiply (s, da);
     870               0 :         *dest = store8888 (s);
     871                 : 
     872               0 :         ++src;
     873               0 :         ++dest;
     874               0 :         ++mask;
     875                 :     }
     876                 :     _mm_empty ();
     877               0 : }
     878                 : 
     879                 : static void
     880               0 : mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
     881                 :                            pixman_op_t              op,
     882                 :                            uint32_t *               dest,
     883                 :                            const uint32_t *         src,
     884                 :                            const uint32_t *         mask,
     885                 :                            int                      width)
     886                 : {
     887               0 :     const uint32_t *end = src + width;
     888                 : 
     889               0 :     while (src < end)
     890                 :     {
     891               0 :         __m64 a = load8888 (*mask);
     892               0 :         __m64 s = load8888 (*src);
     893               0 :         __m64 d = load8888 (*dest);
     894               0 :         __m64 sa = expand_alpha (s);
     895                 : 
     896               0 :         a = pix_multiply (a, sa);
     897               0 :         d = pix_multiply (d, a);
     898               0 :         *dest = store8888 (d);
     899                 : 
     900               0 :         ++src;
     901               0 :         ++dest;
     902               0 :         ++mask;
     903                 :     }
     904                 :     _mm_empty ();
     905               0 : }
     906                 : 
     907                 : static void
     908               0 : mmx_combine_out_ca (pixman_implementation_t *imp,
     909                 :                     pixman_op_t              op,
     910                 :                     uint32_t *               dest,
     911                 :                     const uint32_t *         src,
     912                 :                     const uint32_t *         mask,
     913                 :                     int                      width)
     914                 : {
     915               0 :     const uint32_t *end = src + width;
     916                 : 
     917               0 :     while (src < end)
     918                 :     {
     919               0 :         __m64 a = load8888 (*mask);
     920               0 :         __m64 s = load8888 (*src);
     921               0 :         __m64 d = load8888 (*dest);
     922               0 :         __m64 da = expand_alpha (d);
     923                 : 
     924               0 :         da = negate (da);
     925               0 :         s = pix_multiply (s, a);
     926               0 :         s = pix_multiply (s, da);
     927               0 :         *dest = store8888 (s);
     928                 : 
     929               0 :         ++src;
     930               0 :         ++dest;
     931               0 :         ++mask;
     932                 :     }
     933                 :     _mm_empty ();
     934               0 : }
     935                 : 
     936                 : static void
     937               0 : mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
     938                 :                             pixman_op_t              op,
     939                 :                             uint32_t *               dest,
     940                 :                             const uint32_t *         src,
     941                 :                             const uint32_t *         mask,
     942                 :                             int                      width)
     943                 : {
     944               0 :     const uint32_t *end = src + width;
     945                 : 
     946               0 :     while (src < end)
     947                 :     {
     948               0 :         __m64 a = load8888 (*mask);
     949               0 :         __m64 s = load8888 (*src);
     950               0 :         __m64 d = load8888 (*dest);
     951               0 :         __m64 sa = expand_alpha (s);
     952                 : 
     953               0 :         a = pix_multiply (a, sa);
     954               0 :         a = negate (a);
     955               0 :         d = pix_multiply (d, a);
     956               0 :         *dest = store8888 (d);
     957                 : 
     958               0 :         ++src;
     959               0 :         ++dest;
     960               0 :         ++mask;
     961                 :     }
     962                 :     _mm_empty ();
     963               0 : }
     964                 : 
     965                 : static void
     966               0 : mmx_combine_atop_ca (pixman_implementation_t *imp,
     967                 :                      pixman_op_t              op,
     968                 :                      uint32_t *               dest,
     969                 :                      const uint32_t *         src,
     970                 :                      const uint32_t *         mask,
     971                 :                      int                      width)
     972                 : {
     973               0 :     const uint32_t *end = src + width;
     974                 : 
     975               0 :     while (src < end)
     976                 :     {
     977               0 :         __m64 a = load8888 (*mask);
     978               0 :         __m64 s = load8888 (*src);
     979               0 :         __m64 d = load8888 (*dest);
     980               0 :         __m64 da = expand_alpha (d);
     981               0 :         __m64 sa = expand_alpha (s);
     982                 : 
     983               0 :         s = pix_multiply (s, a);
     984               0 :         a = pix_multiply (a, sa);
     985               0 :         a = negate (a);
     986               0 :         d = pix_add_mul (d, a, s, da);
     987               0 :         *dest = store8888 (d);
     988                 : 
     989               0 :         ++src;
     990               0 :         ++dest;
     991               0 :         ++mask;
     992                 :     }
     993                 :     _mm_empty ();
     994               0 : }
     995                 : 
     996                 : static void
     997               0 : mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
     998                 :                              pixman_op_t              op,
     999                 :                              uint32_t *               dest,
    1000                 :                              const uint32_t *         src,
    1001                 :                              const uint32_t *         mask,
    1002                 :                              int                      width)
    1003                 : {
    1004               0 :     const uint32_t *end = src + width;
    1005                 : 
    1006               0 :     while (src < end)
    1007                 :     {
    1008               0 :         __m64 a = load8888 (*mask);
    1009               0 :         __m64 s = load8888 (*src);
    1010               0 :         __m64 d = load8888 (*dest);
    1011               0 :         __m64 da = expand_alpha (d);
    1012               0 :         __m64 sa = expand_alpha (s);
    1013                 : 
    1014               0 :         s = pix_multiply (s, a);
    1015               0 :         a = pix_multiply (a, sa);
    1016               0 :         da = negate (da);
    1017               0 :         d = pix_add_mul (d, a, s, da);
    1018               0 :         *dest = store8888 (d);
    1019                 : 
    1020               0 :         ++src;
    1021               0 :         ++dest;
    1022               0 :         ++mask;
    1023                 :     }
    1024                 :     _mm_empty ();
    1025               0 : }
    1026                 : 
    1027                 : static void
    1028               0 : mmx_combine_xor_ca (pixman_implementation_t *imp,
    1029                 :                     pixman_op_t              op,
    1030                 :                     uint32_t *               dest,
    1031                 :                     const uint32_t *         src,
    1032                 :                     const uint32_t *         mask,
    1033                 :                     int                      width)
    1034                 : {
    1035               0 :     const uint32_t *end = src + width;
    1036                 : 
    1037               0 :     while (src < end)
    1038                 :     {
    1039               0 :         __m64 a = load8888 (*mask);
    1040               0 :         __m64 s = load8888 (*src);
    1041               0 :         __m64 d = load8888 (*dest);
    1042               0 :         __m64 da = expand_alpha (d);
    1043               0 :         __m64 sa = expand_alpha (s);
    1044                 : 
    1045               0 :         s = pix_multiply (s, a);
    1046               0 :         a = pix_multiply (a, sa);
    1047               0 :         da = negate (da);
    1048               0 :         a = negate (a);
    1049               0 :         d = pix_add_mul (d, a, s, da);
    1050               0 :         *dest = store8888 (d);
    1051                 : 
    1052               0 :         ++src;
    1053               0 :         ++dest;
    1054               0 :         ++mask;
    1055                 :     }
    1056                 :     _mm_empty ();
    1057               0 : }
    1058                 : 
    1059                 : static void
    1060               0 : mmx_combine_add_ca (pixman_implementation_t *imp,
    1061                 :                     pixman_op_t              op,
    1062                 :                     uint32_t *               dest,
    1063                 :                     const uint32_t *         src,
    1064                 :                     const uint32_t *         mask,
    1065                 :                     int                      width)
    1066                 : {
    1067               0 :     const uint32_t *end = src + width;
    1068                 : 
    1069               0 :     while (src < end)
    1070                 :     {
    1071               0 :         __m64 a = load8888 (*mask);
    1072               0 :         __m64 s = load8888 (*src);
    1073               0 :         __m64 d = load8888 (*dest);
    1074                 : 
    1075               0 :         s = pix_multiply (s, a);
    1076               0 :         d = pix_add (s, d);
    1077               0 :         *dest = store8888 (d);
    1078                 : 
    1079               0 :         ++src;
    1080               0 :         ++dest;
    1081               0 :         ++mask;
    1082                 :     }
    1083                 :     _mm_empty ();
    1084               0 : }
    1085                 : 
    1086                 : /* ------------- MMX code paths called from fbpict.c -------------------- */
    1087                 : 
    1088                 : static void
    1089               0 : mmx_composite_over_n_8888 (pixman_implementation_t *imp,
    1090                 :                            pixman_op_t              op,
    1091                 :                            pixman_image_t *         src_image,
    1092                 :                            pixman_image_t *         mask_image,
    1093                 :                            pixman_image_t *         dst_image,
    1094                 :                            int32_t                  src_x,
    1095                 :                            int32_t                  src_y,
    1096                 :                            int32_t                  mask_x,
    1097                 :                            int32_t                  mask_y,
    1098                 :                            int32_t                  dest_x,
    1099                 :                            int32_t                  dest_y,
    1100                 :                            int32_t                  width,
    1101                 :                            int32_t                  height)
    1102                 : {
    1103                 :     uint32_t src;
    1104                 :     uint32_t    *dst_line, *dst;
    1105                 :     int32_t w;
    1106                 :     int dst_stride;
    1107                 :     __m64 vsrc, vsrca;
    1108                 : 
    1109                 :     CHECKPOINT ();
    1110                 : 
    1111               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    1112                 : 
    1113               0 :     if (src == 0)
    1114               0 :         return;
    1115                 : 
    1116               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1117                 : 
    1118               0 :     vsrc = load8888 (src);
    1119               0 :     vsrca = expand_alpha (vsrc);
    1120                 : 
    1121               0 :     while (height--)
    1122                 :     {
    1123               0 :         dst = dst_line;
    1124               0 :         dst_line += dst_stride;
    1125               0 :         w = width;
    1126                 : 
    1127                 :         CHECKPOINT ();
    1128                 : 
    1129               0 :         while (w && (unsigned long)dst & 7)
    1130                 :         {
    1131               0 :             *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
    1132                 : 
    1133               0 :             w--;
    1134               0 :             dst++;
    1135                 :         }
    1136                 : 
    1137               0 :         while (w >= 2)
    1138                 :         {
    1139                 :             __m64 vdest;
    1140                 :             __m64 dest0, dest1;
    1141                 : 
    1142               0 :             vdest = *(__m64 *)dst;
    1143                 : 
    1144               0 :             dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
    1145               0 :             dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
    1146                 : 
    1147               0 :             *(__m64 *)dst = pack8888 (dest0, dest1);
    1148                 : 
    1149               0 :             dst += 2;
    1150               0 :             w -= 2;
    1151                 :         }
    1152                 : 
    1153                 :         CHECKPOINT ();
    1154                 : 
    1155               0 :         while (w)
    1156                 :         {
    1157               0 :             *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
    1158                 : 
    1159               0 :             w--;
    1160               0 :             dst++;
    1161                 :         }
    1162                 :     }
    1163                 : 
    1164                 :     _mm_empty ();
    1165                 : }
    1166                 : 
    1167                 : static void
    1168               0 : mmx_composite_over_n_0565 (pixman_implementation_t *imp,
    1169                 :                            pixman_op_t              op,
    1170                 :                            pixman_image_t *         src_image,
    1171                 :                            pixman_image_t *         mask_image,
    1172                 :                            pixman_image_t *         dst_image,
    1173                 :                            int32_t                  src_x,
    1174                 :                            int32_t                  src_y,
    1175                 :                            int32_t                  mask_x,
    1176                 :                            int32_t                  mask_y,
    1177                 :                            int32_t                  dest_x,
    1178                 :                            int32_t                  dest_y,
    1179                 :                            int32_t                  width,
    1180                 :                            int32_t                  height)
    1181                 : {
    1182                 :     uint32_t src;
    1183                 :     uint16_t    *dst_line, *dst;
    1184                 :     int32_t w;
    1185                 :     int dst_stride;
    1186                 :     __m64 vsrc, vsrca;
    1187                 : 
    1188                 :     CHECKPOINT ();
    1189                 : 
    1190               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    1191                 : 
    1192               0 :     if (src == 0)
    1193               0 :         return;
    1194                 : 
    1195               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    1196                 : 
    1197               0 :     vsrc = load8888 (src);
    1198               0 :     vsrca = expand_alpha (vsrc);
    1199                 : 
    1200               0 :     while (height--)
    1201                 :     {
    1202               0 :         dst = dst_line;
    1203               0 :         dst_line += dst_stride;
    1204               0 :         w = width;
    1205                 : 
    1206                 :         CHECKPOINT ();
    1207                 : 
    1208               0 :         while (w && (unsigned long)dst & 7)
    1209                 :         {
    1210               0 :             uint64_t d = *dst;
    1211               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    1212                 : 
    1213               0 :             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
    1214               0 :             *dst = to_uint64 (vdest);
    1215                 : 
    1216               0 :             w--;
    1217               0 :             dst++;
    1218                 :         }
    1219                 : 
    1220               0 :         while (w >= 4)
    1221                 :         {
    1222                 :             __m64 vdest;
    1223                 : 
    1224               0 :             vdest = *(__m64 *)dst;
    1225                 : 
    1226               0 :             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
    1227               0 :             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
    1228               0 :             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
    1229               0 :             vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
    1230                 : 
    1231               0 :             *(__m64 *)dst = vdest;
    1232                 : 
    1233               0 :             dst += 4;
    1234               0 :             w -= 4;
    1235                 :         }
    1236                 : 
    1237                 :         CHECKPOINT ();
    1238                 : 
    1239               0 :         while (w)
    1240                 :         {
    1241               0 :             uint64_t d = *dst;
    1242               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    1243                 : 
    1244               0 :             vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
    1245               0 :             *dst = to_uint64 (vdest);
    1246                 : 
    1247               0 :             w--;
    1248               0 :             dst++;
    1249                 :         }
    1250                 :     }
    1251                 : 
    1252                 :     _mm_empty ();
    1253                 : }
    1254                 : 
    1255                 : static void
    1256               0 : mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
    1257                 :                                    pixman_op_t              op,
    1258                 :                                    pixman_image_t *         src_image,
    1259                 :                                    pixman_image_t *         mask_image,
    1260                 :                                    pixman_image_t *         dst_image,
    1261                 :                                    int32_t                  src_x,
    1262                 :                                    int32_t                  src_y,
    1263                 :                                    int32_t                  mask_x,
    1264                 :                                    int32_t                  mask_y,
    1265                 :                                    int32_t                  dest_x,
    1266                 :                                    int32_t                  dest_y,
    1267                 :                                    int32_t                  width,
    1268                 :                                    int32_t                  height)
    1269                 : {
    1270                 :     uint32_t src, srca;
    1271                 :     uint32_t    *dst_line;
    1272                 :     uint32_t    *mask_line;
    1273                 :     int dst_stride, mask_stride;
    1274                 :     __m64 vsrc, vsrca;
    1275                 : 
    1276                 :     CHECKPOINT ();
    1277                 : 
    1278               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    1279                 : 
    1280               0 :     srca = src >> 24;
    1281               0 :     if (src == 0)
    1282               0 :         return;
    1283                 : 
    1284               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1285               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    1286                 : 
    1287               0 :     vsrc = load8888 (src);
    1288               0 :     vsrca = expand_alpha (vsrc);
    1289                 : 
    1290               0 :     while (height--)
    1291                 :     {
    1292               0 :         int twidth = width;
    1293               0 :         uint32_t *p = (uint32_t *)mask_line;
    1294               0 :         uint32_t *q = (uint32_t *)dst_line;
    1295                 : 
    1296               0 :         while (twidth && (unsigned long)q & 7)
    1297                 :         {
    1298               0 :             uint32_t m = *(uint32_t *)p;
    1299                 : 
    1300               0 :             if (m)
    1301                 :             {
    1302               0 :                 __m64 vdest = load8888 (*q);
    1303               0 :                 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
    1304               0 :                 *q = store8888 (vdest);
    1305                 :             }
    1306                 : 
    1307               0 :             twidth--;
    1308               0 :             p++;
    1309               0 :             q++;
    1310                 :         }
    1311                 : 
    1312               0 :         while (twidth >= 2)
    1313                 :         {
    1314                 :             uint32_t m0, m1;
    1315               0 :             m0 = *p;
    1316               0 :             m1 = *(p + 1);
    1317                 : 
    1318               0 :             if (m0 | m1)
    1319                 :             {
    1320                 :                 __m64 dest0, dest1;
    1321               0 :                 __m64 vdest = *(__m64 *)q;
    1322                 : 
    1323               0 :                 dest0 = in_over (vsrc, vsrca, load8888 (m0),
    1324                 :                                  expand8888 (vdest, 0));
    1325               0 :                 dest1 = in_over (vsrc, vsrca, load8888 (m1),
    1326                 :                                  expand8888 (vdest, 1));
    1327                 : 
    1328               0 :                 *(__m64 *)q = pack8888 (dest0, dest1);
    1329                 :             }
    1330                 : 
    1331               0 :             p += 2;
    1332               0 :             q += 2;
    1333               0 :             twidth -= 2;
    1334                 :         }
    1335                 : 
    1336               0 :         while (twidth)
    1337                 :         {
    1338               0 :             uint32_t m = *(uint32_t *)p;
    1339                 : 
    1340               0 :             if (m)
    1341                 :             {
    1342               0 :                 __m64 vdest = load8888 (*q);
    1343               0 :                 vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
    1344               0 :                 *q = store8888 (vdest);
    1345                 :             }
    1346                 : 
    1347               0 :             twidth--;
    1348               0 :             p++;
    1349               0 :             q++;
    1350                 :         }
    1351                 : 
    1352               0 :         dst_line += dst_stride;
    1353               0 :         mask_line += mask_stride;
    1354                 :     }
    1355                 : 
    1356                 :     _mm_empty ();
    1357                 : }
    1358                 : 
    1359                 : static void
    1360               0 : mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
    1361                 :                                 pixman_op_t              op,
    1362                 :                                 pixman_image_t *         src_image,
    1363                 :                                 pixman_image_t *         mask_image,
    1364                 :                                 pixman_image_t *         dst_image,
    1365                 :                                 int32_t                  src_x,
    1366                 :                                 int32_t                  src_y,
    1367                 :                                 int32_t                  mask_x,
    1368                 :                                 int32_t                  mask_y,
    1369                 :                                 int32_t                  dest_x,
    1370                 :                                 int32_t                  dest_y,
    1371                 :                                 int32_t                  width,
    1372                 :                                 int32_t                  height)
    1373                 : {
    1374                 :     uint32_t    *dst_line, *dst;
    1375                 :     uint32_t    *src_line, *src;
    1376                 :     uint32_t mask;
    1377                 :     __m64 vmask;
    1378                 :     int dst_stride, src_stride;
    1379                 :     int32_t w;
    1380                 :     __m64 srca;
    1381                 : 
    1382                 :     CHECKPOINT ();
    1383                 : 
    1384               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1385               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    1386                 : 
    1387               0 :     mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format);
    1388               0 :     mask &= 0xff000000;
    1389               0 :     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    1390               0 :     vmask = load8888 (mask);
    1391               0 :     srca = MC (4x00ff);
    1392                 : 
    1393               0 :     while (height--)
    1394                 :     {
    1395               0 :         dst = dst_line;
    1396               0 :         dst_line += dst_stride;
    1397               0 :         src = src_line;
    1398               0 :         src_line += src_stride;
    1399               0 :         w = width;
    1400                 : 
    1401               0 :         while (w && (unsigned long)dst & 7)
    1402                 :         {
    1403               0 :             __m64 s = load8888 (*src);
    1404               0 :             __m64 d = load8888 (*dst);
    1405                 : 
    1406               0 :             *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
    1407                 : 
    1408               0 :             w--;
    1409               0 :             dst++;
    1410               0 :             src++;
    1411                 :         }
    1412                 : 
    1413               0 :         while (w >= 2)
    1414                 :         {
    1415               0 :             __m64 vs = *(__m64 *)src;
    1416               0 :             __m64 vd = *(__m64 *)dst;
    1417               0 :             __m64 vsrc0 = expand8888 (vs, 0);
    1418               0 :             __m64 vsrc1 = expand8888 (vs, 1);
    1419                 : 
    1420               0 :             *(__m64 *)dst = pack8888 (
    1421                 :                 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
    1422                 :                 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
    1423                 : 
    1424               0 :             w -= 2;
    1425               0 :             dst += 2;
    1426               0 :             src += 2;
    1427                 :         }
    1428                 : 
    1429               0 :         while (w)
    1430                 :         {
    1431               0 :             __m64 s = load8888 (*src);
    1432               0 :             __m64 d = load8888 (*dst);
    1433                 : 
    1434               0 :             *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
    1435                 : 
    1436               0 :             w--;
    1437               0 :             dst++;
    1438               0 :             src++;
    1439                 :         }
    1440                 :     }
    1441                 : 
    1442                 :     _mm_empty ();
    1443               0 : }
    1444                 : 
    1445                 : static void
    1446               0 : mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
    1447                 :                                 pixman_op_t              op,
    1448                 :                                 pixman_image_t *         src_image,
    1449                 :                                 pixman_image_t *         mask_image,
    1450                 :                                 pixman_image_t *         dst_image,
    1451                 :                                 int32_t                  src_x,
    1452                 :                                 int32_t                  src_y,
    1453                 :                                 int32_t                  mask_x,
    1454                 :                                 int32_t                  mask_y,
    1455                 :                                 int32_t                  dest_x,
    1456                 :                                 int32_t                  dest_y,
    1457                 :                                 int32_t                  width,
    1458                 :                                 int32_t                  height)
    1459                 : {
    1460                 :     uint32_t *dst_line, *dst;
    1461                 :     uint32_t *src_line, *src;
    1462                 :     uint32_t mask;
    1463                 :     __m64 vmask;
    1464                 :     int dst_stride, src_stride;
    1465                 :     int32_t w;
    1466                 :     __m64 srca;
    1467                 : 
    1468                 :     CHECKPOINT ();
    1469                 : 
    1470               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1471               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    1472               0 :     mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format);
    1473                 : 
    1474               0 :     mask &= 0xff000000;
    1475               0 :     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    1476               0 :     vmask = load8888 (mask);
    1477               0 :     srca = MC (4x00ff);
    1478                 : 
    1479               0 :     while (height--)
    1480                 :     {
    1481               0 :         dst = dst_line;
    1482               0 :         dst_line += dst_stride;
    1483               0 :         src = src_line;
    1484               0 :         src_line += src_stride;
    1485               0 :         w = width;
    1486                 : 
    1487               0 :         while (w && (unsigned long)dst & 7)
    1488                 :         {
    1489               0 :             __m64 s = load8888 (*src | 0xff000000);
    1490               0 :             __m64 d = load8888 (*dst);
    1491                 : 
    1492               0 :             *dst = store8888 (in_over (s, srca, vmask, d));
    1493                 : 
    1494               0 :             w--;
    1495               0 :             dst++;
    1496               0 :             src++;
    1497                 :         }
    1498                 : 
    1499               0 :         while (w >= 16)
    1500                 :         {
    1501               0 :             __m64 vd0 = *(__m64 *)(dst + 0);
    1502               0 :             __m64 vd1 = *(__m64 *)(dst + 2);
    1503               0 :             __m64 vd2 = *(__m64 *)(dst + 4);
    1504               0 :             __m64 vd3 = *(__m64 *)(dst + 6);
    1505               0 :             __m64 vd4 = *(__m64 *)(dst + 8);
    1506               0 :             __m64 vd5 = *(__m64 *)(dst + 10);
    1507               0 :             __m64 vd6 = *(__m64 *)(dst + 12);
    1508               0 :             __m64 vd7 = *(__m64 *)(dst + 14);
    1509                 : 
    1510               0 :             __m64 vs0 = *(__m64 *)(src + 0);
    1511               0 :             __m64 vs1 = *(__m64 *)(src + 2);
    1512               0 :             __m64 vs2 = *(__m64 *)(src + 4);
    1513               0 :             __m64 vs3 = *(__m64 *)(src + 6);
    1514               0 :             __m64 vs4 = *(__m64 *)(src + 8);
    1515               0 :             __m64 vs5 = *(__m64 *)(src + 10);
    1516               0 :             __m64 vs6 = *(__m64 *)(src + 12);
    1517               0 :             __m64 vs7 = *(__m64 *)(src + 14);
    1518                 : 
    1519               0 :             vd0 = pack8888 (
    1520                 :                 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
    1521                 :                 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
    1522                 : 
    1523               0 :             vd1 = pack8888 (
    1524                 :                 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
    1525                 :                 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
    1526                 : 
    1527               0 :             vd2 = pack8888 (
    1528                 :                 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
    1529                 :                 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
    1530                 : 
    1531               0 :             vd3 = pack8888 (
    1532                 :                 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
    1533                 :                 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
    1534                 : 
    1535               0 :             vd4 = pack8888 (
    1536                 :                 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
    1537                 :                 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
    1538                 : 
    1539               0 :             vd5 = pack8888 (
    1540                 :                 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
    1541                 :                 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
    1542                 : 
    1543               0 :             vd6 = pack8888 (
    1544                 :                 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
    1545                 :                 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
    1546                 : 
    1547               0 :             vd7 = pack8888 (
    1548                 :                 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
    1549                 :                 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
    1550                 : 
    1551               0 :             *(__m64 *)(dst + 0) = vd0;
    1552               0 :             *(__m64 *)(dst + 2) = vd1;
    1553               0 :             *(__m64 *)(dst + 4) = vd2;
    1554               0 :             *(__m64 *)(dst + 6) = vd3;
    1555               0 :             *(__m64 *)(dst + 8) = vd4;
    1556               0 :             *(__m64 *)(dst + 10) = vd5;
    1557               0 :             *(__m64 *)(dst + 12) = vd6;
    1558               0 :             *(__m64 *)(dst + 14) = vd7;
    1559                 : 
    1560               0 :             w -= 16;
    1561               0 :             dst += 16;
    1562               0 :             src += 16;
    1563                 :         }
    1564                 : 
    1565               0 :         while (w)
    1566                 :         {
    1567               0 :             __m64 s = load8888 (*src | 0xff000000);
    1568               0 :             __m64 d = load8888 (*dst);
    1569                 : 
    1570               0 :             *dst = store8888 (in_over (s, srca, vmask, d));
    1571                 : 
    1572               0 :             w--;
    1573               0 :             dst++;
    1574               0 :             src++;
    1575                 :         }
    1576                 :     }
    1577                 : 
    1578                 :     _mm_empty ();
    1579               0 : }
    1580                 : 
    1581                 : static void
    1582               0 : mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
    1583                 :                               pixman_op_t              op,
    1584                 :                               pixman_image_t *         src_image,
    1585                 :                               pixman_image_t *         mask_image,
    1586                 :                               pixman_image_t *         dst_image,
    1587                 :                               int32_t                  src_x,
    1588                 :                               int32_t                  src_y,
    1589                 :                               int32_t                  mask_x,
    1590                 :                               int32_t                  mask_y,
    1591                 :                               int32_t                  dest_x,
    1592                 :                               int32_t                  dest_y,
    1593                 :                               int32_t                  width,
    1594                 :                               int32_t                  height)
    1595                 : {
    1596                 :     uint32_t *dst_line, *dst;
    1597                 :     uint32_t *src_line, *src;
    1598                 :     uint32_t s;
    1599                 :     int dst_stride, src_stride;
    1600                 :     uint8_t a;
    1601                 :     int32_t w;
    1602                 : 
    1603                 :     CHECKPOINT ();
    1604                 : 
    1605               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1606               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    1607                 : 
    1608               0 :     while (height--)
    1609                 :     {
    1610               0 :         dst = dst_line;
    1611               0 :         dst_line += dst_stride;
    1612               0 :         src = src_line;
    1613               0 :         src_line += src_stride;
    1614               0 :         w = width;
    1615                 : 
    1616               0 :         while (w--)
    1617                 :         {
    1618               0 :             s = *src++;
    1619               0 :             a = s >> 24;
    1620                 : 
    1621               0 :             if (a == 0xff)
    1622                 :             {
    1623               0 :                 *dst = s;
    1624                 :             }
    1625               0 :             else if (s)
    1626                 :             {
    1627                 :                 __m64 ms, sa;
    1628               0 :                 ms = load8888 (s);
    1629               0 :                 sa = expand_alpha (ms);
    1630               0 :                 *dst = store8888 (over (ms, sa, load8888 (*dst)));
    1631                 :             }
    1632                 : 
    1633               0 :             dst++;
    1634                 :         }
    1635                 :     }
    1636                 :     _mm_empty ();
    1637               0 : }
    1638                 : 
    1639                 : static void
    1640               0 : mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
    1641                 :                               pixman_op_t              op,
    1642                 :                               pixman_image_t *         src_image,
    1643                 :                               pixman_image_t *         mask_image,
    1644                 :                               pixman_image_t *         dst_image,
    1645                 :                               int32_t                  src_x,
    1646                 :                               int32_t                  src_y,
    1647                 :                               int32_t                  mask_x,
    1648                 :                               int32_t                  mask_y,
    1649                 :                               int32_t                  dest_x,
    1650                 :                               int32_t                  dest_y,
    1651                 :                               int32_t                  width,
    1652                 :                               int32_t                  height)
    1653                 : {
    1654                 :     uint16_t    *dst_line, *dst;
    1655                 :     uint32_t    *src_line, *src;
    1656                 :     int dst_stride, src_stride;
    1657                 :     int32_t w;
    1658                 : 
    1659                 :     CHECKPOINT ();
    1660                 : 
    1661               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    1662               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    1663                 : 
    1664                 : #if 0
    1665                 :     /* FIXME */
    1666                 :     assert (src_image->drawable == mask_image->drawable);
    1667                 : #endif
    1668                 : 
    1669               0 :     while (height--)
    1670                 :     {
    1671               0 :         dst = dst_line;
    1672               0 :         dst_line += dst_stride;
    1673               0 :         src = src_line;
    1674               0 :         src_line += src_stride;
    1675               0 :         w = width;
    1676                 : 
    1677                 :         CHECKPOINT ();
    1678                 : 
    1679               0 :         while (w && (unsigned long)dst & 7)
    1680                 :         {
    1681               0 :             __m64 vsrc = load8888 (*src);
    1682               0 :             uint64_t d = *dst;
    1683               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    1684                 : 
    1685               0 :             vdest = pack_565 (
    1686                 :                 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
    1687                 : 
    1688               0 :             *dst = to_uint64 (vdest);
    1689                 : 
    1690               0 :             w--;
    1691               0 :             dst++;
    1692               0 :             src++;
    1693                 :         }
    1694                 : 
    1695                 :         CHECKPOINT ();
    1696                 : 
    1697               0 :         while (w >= 4)
    1698                 :         {
    1699                 :             __m64 vsrc0, vsrc1, vsrc2, vsrc3;
    1700                 :             __m64 vdest;
    1701                 : 
    1702               0 :             vsrc0 = load8888 (*(src + 0));
    1703               0 :             vsrc1 = load8888 (*(src + 1));
    1704               0 :             vsrc2 = load8888 (*(src + 2));
    1705               0 :             vsrc3 = load8888 (*(src + 3));
    1706                 : 
    1707               0 :             vdest = *(__m64 *)dst;
    1708                 : 
    1709               0 :             vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
    1710               0 :             vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
    1711               0 :             vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
    1712               0 :             vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
    1713                 : 
    1714               0 :             *(__m64 *)dst = vdest;
    1715                 : 
    1716               0 :             w -= 4;
    1717               0 :             dst += 4;
    1718               0 :             src += 4;
    1719                 :         }
    1720                 : 
    1721                 :         CHECKPOINT ();
    1722                 : 
    1723               0 :         while (w)
    1724                 :         {
    1725               0 :             __m64 vsrc = load8888 (*src);
    1726               0 :             uint64_t d = *dst;
    1727               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    1728                 : 
    1729               0 :             vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
    1730                 : 
    1731               0 :             *dst = to_uint64 (vdest);
    1732                 : 
    1733               0 :             w--;
    1734               0 :             dst++;
    1735               0 :             src++;
    1736                 :         }
    1737                 :     }
    1738                 : 
    1739                 :     _mm_empty ();
    1740               0 : }
    1741                 : /*
                         :  * Composite a solid source colour through an 8-bit mask onto 32-bit
                         :  * destination pixels, combining them with in_over ().  Destination
                         :  * pixels whose mask byte is zero are left untouched, and the function
                         :  * returns without drawing when the solid colour is 0.  The op, src_x
                         :  * and src_y arguments are not referenced in this function.
                         :  */
    1742                 : static void
    1743               0 : mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
    1744                 :                              pixman_op_t              op,
    1745                 :                              pixman_image_t *         src_image,
    1746                 :                              pixman_image_t *         mask_image,
    1747                 :                              pixman_image_t *         dst_image,
    1748                 :                              int32_t                  src_x,
    1749                 :                              int32_t                  src_y,
    1750                 :                              int32_t                  mask_x,
    1751                 :                              int32_t                  mask_y,
    1752                 :                              int32_t                  dest_x,
    1753                 :                              int32_t                  dest_y,
    1754                 :                              int32_t                  width,
    1755                 :                              int32_t                  height)
    1756                 : {
    1757                 :     uint32_t src, srca;
    1758                 :     uint32_t *dst_line, *dst;
    1759                 :     uint8_t *mask_line, *mask;
    1760                 :     int dst_stride, mask_stride;
    1761                 :     int32_t w;
    1762                 :     __m64 vsrc, vsrca;
    1763                 :     uint64_t srcsrc;
    1764                 : 
    1765                 :     CHECKPOINT ();
    1766                 :     /* Solid source colour; the destination format is handed to the helper. */
    1767               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    1768                 :     /* srca is the top byte of the colour; nothing is drawn when the colour is 0. */
    1769               0 :     srca = src >> 24;
    1770               0 :     if (src == 0)
    1771               0 :         return;
    1772                 :     /* Two copies of the colour, so two pixels can be stored with one 64-bit write. */
    1773               0 :     srcsrc = (uint64_t)src << 32 | src;
    1774                 :     /* Row pointers and strides: 32-bit destination pixels, 8-bit mask values. */
    1775               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    1776               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    1777                 :     /* The colour as an __m64, and vsrca derived from it with expand_alpha (). */
    1778               0 :     vsrc = load8888 (src);
    1779               0 :     vsrca = expand_alpha (vsrc);
    1780                 : 
    1781               0 :     while (height--)
    1782                 :     {
    1783               0 :         dst = dst_line;
    1784               0 :         dst_line += dst_stride;
    1785               0 :         mask = mask_line;
    1786               0 :         mask_line += mask_stride;
    1787               0 :         w = width;
    1788                 : 
    1789                 :         CHECKPOINT ();
    1790                 :         /* Head: one pixel at a time until dst is 8-byte aligned or w reaches 0. */
    1791               0 :         while (w && (unsigned long)dst & 7)
    1792                 :         {
    1793               0 :             uint64_t m = *mask;
    1794                 :             /* A zero mask byte leaves this destination pixel unchanged. */
    1795               0 :             if (m)
    1796                 :             {
    1797               0 :                 __m64 vdest = in_over (vsrc, vsrca,
    1798                 :                                        expand_alpha_rev (to_m64 (m)),
    1799                 :                                        load8888 (*dst));
    1800                 : 
    1801               0 :                 *dst = store8888 (vdest);
    1802                 :             }
    1803                 : 
    1804               0 :             w--;
    1805               0 :             mask++;
    1806               0 :             dst++;
    1807                 :         }
    1808                 : 
    1809                 :         CHECKPOINT ();
    1810                 :         /* Body: two pixels per iteration, using 64-bit accesses to dst. */
    1811               0 :         while (w >= 2)
    1812                 :         {
    1813                 :             uint64_t m0, m1;
    1814                 : 
    1815               0 :             m0 = *mask;
    1816               0 :             m1 = *(mask + 1);
    1817                 :             /* Fast path: srca and both mask bytes are 0xff, so store the colour as is. */
    1818               0 :             if (srca == 0xff && (m0 & m1) == 0xff)
    1819                 :             {
    1820               0 :                 *(uint64_t *)dst = srcsrc;
    1821                 :             }
    1822               0 :             else if (m0 | m1)
    1823                 :             {
    1824                 :                 __m64 vdest;
    1825                 :                 __m64 dest0, dest1;
    1826                 :                 /* Load both pixels, blend each half separately, then store them together. */
    1827               0 :                 vdest = *(__m64 *)dst;
    1828                 : 
    1829               0 :                 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
    1830                 :                                  expand8888 (vdest, 0));
    1831               0 :                 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
    1832                 :                                  expand8888 (vdest, 1));
    1833                 : 
    1834               0 :                 *(__m64 *)dst = pack8888 (dest0, dest1);
    1835                 :             }
    1836                 : 
    1837               0 :             mask += 2;
    1838               0 :             dst += 2;
    1839               0 :             w -= 2;
    1840                 :         }
    1841                 : 
    1842                 :         CHECKPOINT ();
    1843                 :         /* Tail: whatever the pair loop left over (w < 2), one pixel at a time. */
    1844               0 :         while (w)
    1845                 :         {
    1846               0 :             uint64_t m = *mask;
    1847                 : 
    1848               0 :             if (m)
    1849                 :             {
    1850               0 :                 __m64 vdest = load8888 (*dst);
    1851                 : 
    1852               0 :                 vdest = in_over (
    1853                 :                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
    1854               0 :                 *dst = store8888 (vdest);
    1855                 :             }
    1856                 : 
    1857               0 :             w--;
    1858               0 :             mask++;
    1859               0 :             dst++;
    1860                 :         }
    1861                 :     }
    1862                 :     /* _mm_empty () is normally the EMMS intrinsic; issued once all rows are done. */
    1863                 :     _mm_empty ();
    1864                 : }
    1865                 : /*
                         :  * Fill a width x height rectangle whose top-left pixel is (x, y) in the
                         :  * buffer `bits` with the value `xor`.  Despite the parameter's name the
                         :  * value is stored, not XOR-ed into the destination.
                         :  *
                         :  * stride is multiplied by sizeof (uint32_t) below, i.e. it is treated as
                         :  * a count of uint32_t units per row.  Only 8, 16 and 32 bits per pixel
                         :  * are handled: FALSE is returned for any other bpp, TRUE after filling.
                         :  * For 8 and 16 bpp only the low 8 / 16 bits of xor are used.
                         :  *
                         :  * NOTE(review): x, y, width and height are not range-checked here;
                         :  * presumably the caller guarantees a valid rectangle - confirm.
                         :  */
    1866                 : pixman_bool_t
    1867               0 : pixman_fill_mmx (uint32_t *bits,
    1868                 :                  int       stride,
    1869                 :                  int       bpp,
    1870                 :                  int       x,
    1871                 :                  int       y,
    1872                 :                  int       width,
    1873                 :                  int       height,
    1874                 :                  uint32_t xor)
    1875                 : {
    1876                 :     uint64_t fill;
    1877                 :     __m64 vfill;
    1878                 :     uint32_t byte_width;
    1879                 :     uint8_t     *byte_line;
    1880                 :     /* v1..v7 are only used by the GCC inline-assembly path below. */
    1881                 : #ifdef __GNUC__
    1882                 :     __m64 v1, v2, v3, v4, v5, v6, v7;
    1883                 : #endif
    1884                 :     /* Reject every depth other than 8, 16 and 32 bits per pixel. */
    1885               0 :     if (bpp != 16 && bpp != 32 && bpp != 8)
    1886               0 :         return FALSE;
    1887                 :     /*
                         :      * Per depth: byte_line becomes the address of the first pixel to
                         :      * fill, byte_width the length of one row of the rectangle in bytes,
                         :      * stride ends up in bytes, and (for 8 and 16 bpp) the pixel value is
                         :      * replicated so that xor holds it in every 8- or 16-bit lane.
                         :      */
    1888               0 :     if (bpp == 8)
    1889                 :     {
    1890               0 :         stride = stride * (int) sizeof (uint32_t) / 1;
    1891               0 :         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
    1892               0 :         byte_width = width;
    1893               0 :         stride *= 1;
    1894               0 :         xor = (xor & 0xff) * 0x01010101;
    1895                 :     }
    1896               0 :     else if (bpp == 16)
    1897                 :     {
    1898               0 :         stride = stride * (int) sizeof (uint32_t) / 2;
    1899               0 :         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
    1900               0 :         byte_width = 2 * width;
    1901               0 :         stride *= 2;
    1902               0 :         xor = (xor & 0xffff) * 0x00010001;
    1903                 :     }
    1904                 :     else
    1905                 :     {
    1906               0 :         stride = stride * (int) sizeof (uint32_t) / 4;
    1907               0 :         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
    1908               0 :         byte_width = 4 * width;
    1909               0 :         stride *= 4;
    1910                 :     }
    1911                 :     /* 64-bit fill pattern: xor in both 32-bit halves. */
    1912               0 :     fill = ((uint64_t)xor << 32) | xor;
    1913               0 :     vfill = to_m64 (fill);
    1914                 :     /* With GCC, copy the pattern into seven more "y"-constraint (MMX) operands. */
    1915                 : #ifdef __GNUC__
    1916               0 :     __asm__ (
    1917                 :         "movq              %7,     %0\n"
    1918                 :         "movq              %7,     %1\n"
    1919                 :         "movq              %7,     %2\n"
    1920                 :         "movq              %7,     %3\n"
    1921                 :         "movq              %7,     %4\n"
    1922                 :         "movq              %7,     %5\n"
    1923                 :         "movq              %7,     %6\n"
    1924                 :         : "=&y" (v1), "=&y" (v2), "=&y" (v3),
    1925                 :           "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
    1926                 :         : "y" (vfill));
    1927                 : #endif
    1928                 :     /* One iteration per row of the rectangle. */
    1929               0 :     while (height--)
    1930                 :     {
    1931                 :         int w;
    1932               0 :         uint8_t *d = byte_line;
    1933                 : 
    1934               0 :         byte_line += stride;
    1935               0 :         w = byte_width;
    1936                 :         /* Head: 1-, 2- and then 4-byte stores until d is 8-byte aligned. */
    1937               0 :         while (w >= 1 && ((unsigned long)d & 1))
    1938                 :         {
    1939               0 :             *(uint8_t *)d = (xor & 0xff);
    1940               0 :             w--;
    1941               0 :             d++;
    1942                 :         }
    1943                 : 
    1944               0 :         while (w >= 2 && ((unsigned long)d & 3))
    1945                 :         {
    1946               0 :             *(uint16_t *)d = xor;
    1947               0 :             w -= 2;
    1948               0 :             d += 2;
    1949                 :         }
    1950                 : 
    1951               0 :         while (w >= 4 && ((unsigned long)d & 7))
    1952                 :         {
    1953               0 :             *(uint32_t *)d = xor;
    1954                 : 
    1955               0 :             w -= 4;
    1956               0 :             d += 4;
    1957                 :         }
    1958                 :         /* Body: 64 bytes per iteration, written as eight 8-byte stores. */
    1959               0 :         while (w >= 64)
    1960                 :         {
    1961                 : #ifdef __GNUC__
    1962               0 :             __asm__ (
    1963                 :                 "movq      %1,       (%0)\n"
    1964                 :                 "movq      %2,      8(%0)\n"
    1965                 :                 "movq      %3,     16(%0)\n"
    1966                 :                 "movq      %4,     24(%0)\n"
    1967                 :                 "movq      %5,     32(%0)\n"
    1968                 :                 "movq      %6,     40(%0)\n"
    1969                 :                 "movq      %7,     48(%0)\n"
    1970                 :                 "movq      %8,     56(%0)\n"
    1971                 :                 :
    1972                 :                 : "r" (d),
    1973                 :                   "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
    1974                 :                   "y" (v4), "y" (v5), "y" (v6), "y" (v7)
    1975                 :                 : "memory");
    1976                 : #else
    1977                 :             *(__m64*) (d +  0) = vfill;
    1978                 :             *(__m64*) (d +  8) = vfill;
    1979                 :             *(__m64*) (d + 16) = vfill;
    1980                 :             *(__m64*) (d + 24) = vfill;
    1981                 :             *(__m64*) (d + 32) = vfill;
    1982                 :             *(__m64*) (d + 40) = vfill;
    1983                 :             *(__m64*) (d + 48) = vfill;
    1984                 :             *(__m64*) (d + 56) = vfill;
    1985                 : #endif
    1986               0 :             w -= 64;
    1987               0 :             d += 64;
    1988                 :         }
    1989                 :         /* Tail: the remaining bytes, with 4-, 2- and then 1-byte stores. */
    1990               0 :         while (w >= 4)
    1991                 :         {
    1992               0 :             *(uint32_t *)d = xor;
    1993                 : 
    1994               0 :             w -= 4;
    1995               0 :             d += 4;
    1996                 :         }
    1997               0 :         while (w >= 2)
    1998                 :         {
    1999               0 :             *(uint16_t *)d = xor;
    2000               0 :             w -= 2;
    2001               0 :             d += 2;
    2002                 :         }
    2003               0 :         while (w >= 1)
    2004                 :         {
    2005               0 :             *(uint8_t *)d = (xor & 0xff);
    2006               0 :             w--;
    2007               0 :             d++;
    2008                 :         }
    2009                 : 
    2010                 :     }
    2011                 :     /* _mm_empty () is normally the EMMS intrinsic; issued once all rows are done. */
    2012                 :     _mm_empty ();
    2013               0 :     return TRUE;
    2014                 : }
    2015                 : /*
                         :  * Write a solid source colour, scaled by an 8-bit mask with in (), into
                         :  * 32-bit destination pixels.  The previous destination value does not
                         :  * enter the result: a pixel whose mask byte is zero is set to 0.  When
                         :  * the solid colour itself is 0 the rectangle is zero-filled with
                         :  * pixman_fill_mmx () instead.  The op, src_x and src_y arguments are not
                         :  * referenced in this function.
                         :  */
    2016                 : static void
    2017               0 : mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
    2018                 :                             pixman_op_t              op,
    2019                 :                             pixman_image_t *         src_image,
    2020                 :                             pixman_image_t *         mask_image,
    2021                 :                             pixman_image_t *         dst_image,
    2022                 :                             int32_t                  src_x,
    2023                 :                             int32_t                  src_y,
    2024                 :                             int32_t                  mask_x,
    2025                 :                             int32_t                  mask_y,
    2026                 :                             int32_t                  dest_x,
    2027                 :                             int32_t                  dest_y,
    2028                 :                             int32_t                  width,
    2029                 :                             int32_t                  height)
    2030                 : {
    2031                 :     uint32_t src, srca;
    2032                 :     uint32_t    *dst_line, *dst;
    2033                 :     uint8_t     *mask_line, *mask;
    2034                 :     int dst_stride, mask_stride;
    2035                 :     int32_t w;
    2036                 :     __m64 vsrc, vsrca;
    2037                 :     uint64_t srcsrc;
    2038                 : 
    2039                 :     CHECKPOINT ();
    2040                 :     /* Solid source colour; the destination format is handed to the helper. */
    2041               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2042                 :     /*
                         :      * srca is the top byte of the colour.  A colour of 0 is handled by
                         :      * zero-filling the rectangle; the return value of pixman_fill_mmx ()
                         :      * is not checked.
                         :      */
    2043               0 :     srca = src >> 24;
    2044               0 :     if (src == 0)
    2045                 :     {
    2046               0 :         pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride,
    2047               0 :                          PIXMAN_FORMAT_BPP (dst_image->bits.format),
    2048                 :                          dest_x, dest_y, width, height, 0);
    2049               0 :         return;
    2050                 :     }
    2051                 :     /* Two copies of the colour, so two pixels can be stored with one 64-bit write. */
    2052               0 :     srcsrc = (uint64_t)src << 32 | src;
    2053                 :     /* Row pointers and strides: 32-bit destination pixels, 8-bit mask values. */
    2054               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2055               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    2056                 :     /* vsrca is computed here but not read anywhere later in this function. */
    2057               0 :     vsrc = load8888 (src);
    2058               0 :     vsrca = expand_alpha (vsrc);
    2059                 : 
    2060               0 :     while (height--)
    2061                 :     {
    2062               0 :         dst = dst_line;
    2063               0 :         dst_line += dst_stride;
    2064               0 :         mask = mask_line;
    2065               0 :         mask_line += mask_stride;
    2066               0 :         w = width;
    2067                 : 
    2068                 :         CHECKPOINT ();
    2069                 :         /* Head: one pixel at a time until dst is 8-byte aligned or w reaches 0. */
    2070               0 :         while (w && (unsigned long)dst & 7)
    2071                 :         {
    2072               0 :             uint64_t m = *mask;
    2073                 :             /* Non-zero mask: colour scaled by the mask; zero mask: the pixel becomes 0. */
    2074               0 :             if (m)
    2075                 :             {
    2076               0 :                 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
    2077                 : 
    2078               0 :                 *dst = store8888 (vdest);
    2079                 :             }
    2080                 :             else
    2081                 :             {
    2082               0 :                 *dst = 0;
    2083                 :             }
    2084                 : 
    2085               0 :             w--;
    2086               0 :             mask++;
    2087               0 :             dst++;
    2088                 :         }
    2089                 : 
    2090                 :         CHECKPOINT ();
    2091                 :         /* Body: two pixels per iteration, written with a single 64-bit store. */
    2092               0 :         while (w >= 2)
    2093                 :         {
    2094                 :             uint64_t m0, m1;
    2095               0 :             m0 = *mask;
    2096               0 :             m1 = *(mask + 1);
    2097                 :             /* Fast path: srca and both mask bytes are 0xff, so store the colour as is. */
    2098               0 :             if (srca == 0xff && (m0 & m1) == 0xff)
    2099                 :             {
    2100               0 :                 *(uint64_t *)dst = srcsrc;
    2101                 :             }
    2102               0 :             else if (m0 | m1)
    2103                 :             {
    2104                 :                 __m64 vdest;
    2105                 :                 __m64 dest0, dest1;
    2106                 : 
    2107               0 :                 vdest = *(__m64 *)dst;
    2108                 :                 /* NOTE(review): the vdest loaded above is never read in this branch. */
    2109               0 :                 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
    2110               0 :                 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
    2111                 : 
    2112               0 :                 *(__m64 *)dst = pack8888 (dest0, dest1);
    2113                 :             }
    2114                 :             else
    2115                 :             {
    2116               0 :                 *(uint64_t *)dst = 0;
    2117                 :             }
    2118                 : 
    2119               0 :             mask += 2;
    2120               0 :             dst += 2;
    2121               0 :             w -= 2;
    2122                 :         }
    2123                 : 
    2124                 :         CHECKPOINT ();
    2125                 :         /* Tail: whatever the pair loop left over (w < 2), one pixel at a time. */
    2126               0 :         while (w)
    2127                 :         {
    2128               0 :             uint64_t m = *mask;
    2129                 : 
    2130               0 :             if (m)
    2131                 :             {
    2132               0 :                 __m64 vdest = load8888 (*dst);
    2133                 :                 /* NOTE(review): the value just loaded from *dst is overwritten right away. */
    2134               0 :                 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
    2135               0 :                 *dst = store8888 (vdest);
    2136                 :             }
    2137                 :             else
    2138                 :             {
    2139               0 :                 *dst = 0;
    2140                 :             }
    2141                 : 
    2142               0 :             w--;
    2143               0 :             mask++;
    2144               0 :             dst++;
    2145                 :         }
    2146                 :     }
    2147                 :     /* _mm_empty () is normally the EMMS intrinsic; issued once all rows are done. */
    2148                 :     _mm_empty ();
    2149                 : }
    2150                 : /*
                         :  * Composite a solid source colour through an 8-bit mask onto 16-bit
                         :  * destination pixels.  Each affected pixel is widened with expand565 (),
                         :  * combined with in_over () and narrowed again with pack_565 ().
                         :  * Destination pixels whose mask byte is zero are left untouched, and the
                         :  * function returns without drawing when the solid colour is 0.  The op,
                         :  * src_x and src_y arguments are not referenced in this function.
                         :  */
    2151                 : static void
    2152               0 : mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
    2153                 :                              pixman_op_t              op,
    2154                 :                              pixman_image_t *         src_image,
    2155                 :                              pixman_image_t *         mask_image,
    2156                 :                              pixman_image_t *         dst_image,
    2157                 :                              int32_t                  src_x,
    2158                 :                              int32_t                  src_y,
    2159                 :                              int32_t                  mask_x,
    2160                 :                              int32_t                  mask_y,
    2161                 :                              int32_t                  dest_x,
    2162                 :                              int32_t                  dest_y,
    2163                 :                              int32_t                  width,
    2164                 :                              int32_t                  height)
    2165                 : {
    2166                 :     uint32_t src, srca;
    2167                 :     uint16_t *dst_line, *dst;
    2168                 :     uint8_t *mask_line, *mask;
    2169                 :     int dst_stride, mask_stride;
    2170                 :     int32_t w;
    2171                 :     __m64 vsrc, vsrca, tmp;
    2172                 :     uint64_t srcsrcsrcsrc, src16;
    2173                 : 
    2174                 :     CHECKPOINT ();
    2175                 :     /* Solid source colour; the destination format is handed to the helper. */
    2176               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2177                 :     /* srca is the top byte of the colour; nothing is drawn when the colour is 0. */
    2178               0 :     srca = src >> 24;
    2179               0 :     if (src == 0)
    2180               0 :         return;
    2181                 :     /* Row pointers and strides: 16-bit destination pixels, 8-bit mask values. */
    2182               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2183               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    2184                 :     /* The colour as an __m64, and vsrca derived from it with expand_alpha (). */
    2185               0 :     vsrc = load8888 (src);
    2186               0 :     vsrca = expand_alpha (vsrc);
    2187                 :     /* The colour packed by pack_565 () into lane 0 of an all-zero vector. */
    2188               0 :     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    2189               0 :     src16 = to_uint64 (tmp);
    2190                 :     /*
                         :      * Four copies of src16 at 16-bit offsets, for the 64-bit fast-path
                         :      * store below.  NOTE(review): this assumes src16 has no bits set
                         :      * above bit 15 - confirm against pack_565 ().
                         :      */
    2191               0 :     srcsrcsrcsrc =
    2192               0 :         (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
    2193               0 :         (uint64_t)src16 << 16 | (uint64_t)src16;
    2194                 : 
    2195               0 :     while (height--)
    2196                 :     {
    2197               0 :         dst = dst_line;
    2198               0 :         dst_line += dst_stride;
    2199               0 :         mask = mask_line;
    2200               0 :         mask_line += mask_stride;
    2201               0 :         w = width;
    2202                 : 
    2203                 :         CHECKPOINT ();
    2204                 :         /* Head: one pixel at a time until dst is 8-byte aligned or w reaches 0. */
    2205               0 :         while (w && (unsigned long)dst & 7)
    2206                 :         {
    2207               0 :             uint64_t m = *mask;
    2208                 :             /* A zero mask byte leaves this destination pixel unchanged. */
    2209               0 :             if (m)
    2210                 :             {
    2211               0 :                 uint64_t d = *dst;
    2212               0 :                 __m64 vd = to_m64 (d);
    2213               0 :                 __m64 vdest = in_over (
    2214                 :                     vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
    2215                 :                 /* Repack the result; the assignment to *dst keeps its low 16 bits. */
    2216               0 :                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
    2217               0 :                 *dst = to_uint64 (vd);
    2218                 :             }
    2219                 : 
    2220               0 :             w--;
    2221               0 :             mask++;
    2222               0 :             dst++;
    2223                 :         }
    2224                 : 
    2225                 :         CHECKPOINT ();
    2226                 :         /* Body: four pixels per iteration, using 64-bit accesses to dst. */
    2227               0 :         while (w >= 4)
    2228                 :         {
    2229                 :             uint64_t m0, m1, m2, m3;
    2230               0 :             m0 = *mask;
    2231               0 :             m1 = *(mask + 1);
    2232               0 :             m2 = *(mask + 2);
    2233               0 :             m3 = *(mask + 3);
    2234                 :             /* Fast path: srca and all four mask bytes are 0xff, so store the packed colour. */
    2235               0 :             if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
    2236                 :             {
    2237               0 :                 *(uint64_t *)dst = srcsrcsrcsrc;
    2238                 :             }
    2239               0 :             else if (m0 | m1 | m2 | m3)
    2240                 :             {
    2241                 :                 __m64 vdest;
    2242                 :                 __m64 vm0, vm1, vm2, vm3;
    2243                 :                 /* Load four pixels, then update lanes 0..3 of vdest one after another. */
    2244               0 :                 vdest = *(__m64 *)dst;
    2245                 : 
    2246               0 :                 vm0 = to_m64 (m0);
    2247               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
    2248                 :                                            expand565 (vdest, 0)), vdest, 0);
    2249               0 :                 vm1 = to_m64 (m1);
    2250               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
    2251                 :                                            expand565 (vdest, 1)), vdest, 1);
    2252               0 :                 vm2 = to_m64 (m2);
    2253               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
    2254                 :                                            expand565 (vdest, 2)), vdest, 2);
    2255               0 :                 vm3 = to_m64 (m3);
    2256               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
    2257                 :                                            expand565 (vdest, 3)), vdest, 3);
    2258                 : 
    2259               0 :                 *(__m64 *)dst = vdest;
    2260                 :             }
    2261                 : 
    2262               0 :             w -= 4;
    2263               0 :             mask += 4;
    2264               0 :             dst += 4;
    2265                 :         }
    2266                 : 
    2267                 :         CHECKPOINT ();
    2268                 :         /* Tail: whatever the four-pixel loop left over (w < 4), one pixel at a time. */
    2269               0 :         while (w)
    2270                 :         {
    2271               0 :             uint64_t m = *mask;
    2272                 : 
    2273               0 :             if (m)
    2274                 :             {
    2275               0 :                 uint64_t d = *dst;
    2276               0 :                 __m64 vd = to_m64 (d);
    2277               0 :                 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
    2278                 :                                        expand565 (vd, 0));
    2279               0 :                 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
    2280               0 :                 *dst = to_uint64 (vd);
    2281                 :             }
    2282                 : 
    2283               0 :             w--;
    2284               0 :             mask++;
    2285               0 :             dst++;
    2286                 :         }
    2287                 :     }
    2288                 :     /* _mm_empty () is normally the EMMS intrinsic; issued once all rows are done. */
    2289                 :     _mm_empty ();
    2290                 : }
    2291                 : 
    2292                 : static void
    2293               0 : mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
    2294                 :                                 pixman_op_t              op,
    2295                 :                                 pixman_image_t *         src_image,
    2296                 :                                 pixman_image_t *         mask_image,
    2297                 :                                 pixman_image_t *         dst_image,
    2298                 :                                 int32_t                  src_x,
    2299                 :                                 int32_t                  src_y,
    2300                 :                                 int32_t                  mask_x,
    2301                 :                                 int32_t                  mask_y,
    2302                 :                                 int32_t                  dest_x,
    2303                 :                                 int32_t                  dest_y,
    2304                 :                                 int32_t                  width,
    2305                 :                                 int32_t                  height)
    2306                 : {
    2307                 :     uint16_t    *dst_line, *dst;
    2308                 :     uint32_t    *src_line, *src;
    2309                 :     int dst_stride, src_stride;
    2310                 :     int32_t w;
    2311                 : 
    2312                 :     CHECKPOINT ();
    2313                 : 
    2314               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2315               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2316                 : 
    2317                 : #if 0
    2318                 :     /* FIXME */
    2319                 :     assert (src_image->drawable == mask_image->drawable);
    2320                 : #endif
    2321                 : 
    2322               0 :     while (height--)
    2323                 :     {
    2324               0 :         dst = dst_line;
    2325               0 :         dst_line += dst_stride;
    2326               0 :         src = src_line;
    2327               0 :         src_line += src_stride;
    2328               0 :         w = width;
    2329                 : 
    2330                 :         CHECKPOINT ();
    2331                 : 
    2332               0 :         while (w && (unsigned long)dst & 7)
    2333                 :         {
    2334               0 :             __m64 vsrc = load8888 (*src);
    2335               0 :             uint64_t d = *dst;
    2336               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    2337                 : 
    2338               0 :             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
    2339                 : 
    2340               0 :             *dst = to_uint64 (vdest);
    2341                 : 
    2342               0 :             w--;
    2343               0 :             dst++;
    2344               0 :             src++;
    2345                 :         }
    2346                 : 
    2347                 :         CHECKPOINT ();
    2348                 : 
    2349               0 :         while (w >= 4)
    2350                 :         {
    2351                 :             uint32_t s0, s1, s2, s3;
    2352                 :             unsigned char a0, a1, a2, a3;
    2353                 : 
    2354               0 :             s0 = *src;
    2355               0 :             s1 = *(src + 1);
    2356               0 :             s2 = *(src + 2);
    2357               0 :             s3 = *(src + 3);
    2358                 : 
    2359               0 :             a0 = (s0 >> 24);
    2360               0 :             a1 = (s1 >> 24);
    2361               0 :             a2 = (s2 >> 24);
    2362               0 :             a3 = (s3 >> 24);
    2363                 : 
    2364               0 :             if ((a0 & a1 & a2 & a3) == 0xFF)
    2365                 :             {
    2366                 :                 __m64 vdest;
    2367               0 :                 vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
    2368               0 :                 vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
    2369               0 :                 vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
    2370               0 :                 vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);
    2371                 : 
    2372               0 :                 *(__m64 *)dst = vdest;
    2373                 :             }
    2374               0 :             else if (s0 | s1 | s2 | s3)
    2375                 :             {
    2376               0 :                 __m64 vdest = *(__m64 *)dst;
    2377                 : 
    2378               0 :                 vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
    2379               0 :                 vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
    2380               0 :                 vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
    2381               0 :                 vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);
    2382                 : 
    2383               0 :                 *(__m64 *)dst = vdest;
    2384                 :             }
    2385                 : 
    2386               0 :             w -= 4;
    2387               0 :             dst += 4;
    2388               0 :             src += 4;
    2389                 :         }
    2390                 : 
    2391                 :         CHECKPOINT ();
    2392                 : 
    2393               0 :         while (w)
    2394                 :         {
    2395               0 :             __m64 vsrc = load8888 (*src);
    2396               0 :             uint64_t d = *dst;
    2397               0 :             __m64 vdest = expand565 (to_m64 (d), 0);
    2398                 : 
    2399               0 :             vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
    2400                 : 
    2401               0 :             *dst = to_uint64 (vdest);
    2402                 : 
    2403               0 :             w--;
    2404               0 :             dst++;
    2405               0 :             src++;
    2406                 :         }
    2407                 :     }
    2408                 : 
    2409                 :     _mm_empty ();
    2410               0 : }
    2411                 : /* Blend a 32-bit source onto a 32-bit destination with over_rev_non_pre(); source pairs whose top bytes are both 0xFF are stored via invert_colors() without reading dst. */
    2412                 : static void
    2413               0 : mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
    2414                 :                                 pixman_op_t              op,
    2415                 :                                 pixman_image_t *         src_image,
    2416                 :                                 pixman_image_t *         mask_image,
    2417                 :                                 pixman_image_t *         dst_image,
    2418                 :                                 int32_t                  src_x,
    2419                 :                                 int32_t                  src_y,
    2420                 :                                 int32_t                  mask_x,
    2421                 :                                 int32_t                  mask_y,
    2422                 :                                 int32_t                  dest_x,
    2423                 :                                 int32_t                  dest_y,
    2424                 :                                 int32_t                  width,
    2425                 :                                 int32_t                  height)
    2426                 : {
    2427                 :     uint32_t    *dst_line, *dst;
    2428                 :     uint32_t    *src_line, *src;
    2429                 :     int dst_stride, src_stride;
    2430                 :     int32_t w;
    2431                 : 
    2432                 :     CHECKPOINT ();
    2433                 : 
    2434               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2435               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2436                 : 
    2437                 : #if 0
    2438                 :     /* FIXME */
    2439                 :     assert (src_image->drawable == mask_image->drawable);
    2440                 : #endif
    2441                 : 
    2442               0 :     while (height--)
    2443                 :     {
    2444               0 :         dst = dst_line;
    2445               0 :         dst_line += dst_stride;
    2446               0 :         src = src_line;
    2447               0 :         src_line += src_stride;
    2448               0 :         w = width;
    2449                 :         /* Head: single pixels until dst is 8-byte aligned or the row is exhausted. */
    2450               0 :         while (w && (unsigned long)dst & 7)
    2451                 :         {
    2452               0 :             __m64 s = load8888 (*src);
    2453               0 :             __m64 d = load8888 (*dst);
    2454                 : 
    2455               0 :             *dst = store8888 (over_rev_non_pre (s, d));
    2456                 : 
    2457               0 :             w--;
    2458               0 :             dst++;
    2459               0 :             src++;
    2460                 :         }
    2461                 :         /* Body: two pixels per iteration through 64-bit accesses to dst. */
    2462               0 :         while (w >= 2)
    2463                 :         {
    2464                 :             uint64_t s0, s1;
    2465                 :             unsigned char a0, a1;
    2466                 :             __m64 d0, d1;
    2467                 : 
    2468               0 :             s0 = *src;
    2469               0 :             s1 = *(src + 1);
    2470                 :             /* Top byte (bits 24-31) of each source pixel. */
    2471               0 :             a0 = (s0 >> 24);
    2472               0 :             a1 = (s1 >> 24);
    2473                 :             /* Both top bytes 0xFF: dst is overwritten without being read. */
    2474               0 :             if ((a0 & a1) == 0xFF)
    2475                 :             {
    2476               0 :                 d0 = invert_colors (load8888 (s0));
    2477               0 :                 d1 = invert_colors (load8888 (s1));
    2478                 : 
    2479               0 :                 *(__m64 *)dst = pack8888 (d0, d1);
    2480                 :             }
    2481               0 :             else if (s0 | s1)
    2482                 :             {
    2483               0 :                 __m64 vdest = *(__m64 *)dst;
    2484                 :                 /* Reached only when s0 | s1 is non-zero; two all-zero source pixels leave dst unchanged. */
    2485               0 :                 d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
    2486               0 :                 d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));
    2487                 : 
    2488               0 :                 *(__m64 *)dst = pack8888 (d0, d1);
    2489                 :             }
    2490                 : 
    2491               0 :             w -= 2;
    2492               0 :             dst += 2;
    2493               0 :             src += 2;
    2494                 :         }
    2495                 :         /* Tail: whatever the paired loop left over, one pixel at a time. */
    2496               0 :         while (w)
    2497                 :         {
    2498               0 :             __m64 s = load8888 (*src);
    2499               0 :             __m64 d = load8888 (*dst);
    2500                 : 
    2501               0 :             *dst = store8888 (over_rev_non_pre (s, d));
    2502                 : 
    2503               0 :             w--;
    2504               0 :             dst++;
    2505               0 :             src++;
    2506                 :         }
    2507                 :     }
    2508                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2509                 :     _mm_empty ();
    2510               0 : }
    2511                 : /* Solid source (from _pixman_image_get_solid) applied through a 32-bit-per-pixel mask onto a 16-bit destination with in_over(); zero mask words are skipped. */
    2512                 : static void
    2513               0 : mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
    2514                 :                                    pixman_op_t              op,
    2515                 :                                    pixman_image_t *         src_image,
    2516                 :                                    pixman_image_t *         mask_image,
    2517                 :                                    pixman_image_t *         dst_image,
    2518                 :                                    int32_t                  src_x,
    2519                 :                                    int32_t                  src_y,
    2520                 :                                    int32_t                  mask_x,
    2521                 :                                    int32_t                  mask_y,
    2522                 :                                    int32_t                  dest_x,
    2523                 :                                    int32_t                  dest_y,
    2524                 :                                    int32_t                  width,
    2525                 :                                    int32_t                  height)
    2526                 : {
    2527                 :     uint32_t src, srca;
    2528                 :     uint16_t    *dst_line;
    2529                 :     uint32_t    *mask_line;
    2530                 :     int dst_stride, mask_stride;
    2531                 :     __m64 vsrc, vsrca;
    2532                 : 
    2533                 :     CHECKPOINT ();
    2534                 : 
    2535               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2536                 :     /* NOTE(review): srca is assigned below but never read in this function. */
    2537               0 :     srca = src >> 24;
    2538               0 :     if (src == 0)
    2539               0 :         return;
    2540                 :     /* A solid value of 0 returned above, leaving the destination untouched. */
    2541               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2542               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    2543                 : 
    2544               0 :     vsrc = load8888 (src);
    2545               0 :     vsrca = expand_alpha (vsrc);
    2546                 : 
    2547               0 :     while (height--)
    2548                 :     {
    2549               0 :         int twidth = width;
    2550               0 :         uint32_t *p = (uint32_t *)mask_line;
    2551               0 :         uint16_t *q = (uint16_t *)dst_line;
    2552                 :         /* Head: single pixels until q is 8-byte aligned or the row is exhausted. */
    2553               0 :         while (twidth && ((unsigned long)q & 7))
    2554                 :         {
    2555               0 :             uint32_t m = *(uint32_t *)p;
    2556                 :             /* A zero mask word leaves this destination pixel as it is. */
    2557               0 :             if (m)
    2558                 :             {
    2559               0 :                 uint64_t d = *q;
    2560               0 :                 __m64 vdest = expand565 (to_m64 (d), 0);
    2561               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
    2562               0 :                 *q = to_uint64 (vdest);
    2563                 :             }
    2564                 : 
    2565               0 :             twidth--;
    2566               0 :             p++;
    2567               0 :             q++;
    2568                 :         }
    2569                 :         /* Body: four 16-bit destination pixels per 64-bit load/store of q. */
    2570               0 :         while (twidth >= 4)
    2571                 :         {
    2572                 :             uint32_t m0, m1, m2, m3;
    2573                 : 
    2574               0 :             m0 = *p;
    2575               0 :             m1 = *(p + 1);
    2576               0 :             m2 = *(p + 2);
    2577               0 :             m3 = *(p + 3);
    2578                 :             /* The whole group is skipped when all four mask words are zero. */
    2579               0 :             if ((m0 | m1 | m2 | m3))
    2580                 :             {
    2581               0 :                 __m64 vdest = *(__m64 *)q;
    2582                 : 
    2583               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
    2584               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
    2585               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
    2586               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
    2587                 : 
    2588               0 :                 *(__m64 *)q = vdest;
    2589                 :             }
    2590               0 :             twidth -= 4;
    2591               0 :             p += 4;
    2592               0 :             q += 4;
    2593                 :         }
    2594                 :         /* Tail: remaining pixels, one at a time. */
    2595               0 :         while (twidth)
    2596                 :         {
    2597                 :             uint32_t m;
    2598                 : 
    2599               0 :             m = *(uint32_t *)p;
    2600               0 :             if (m)
    2601                 :             {
    2602               0 :                 uint64_t d = *q;
    2603               0 :                 __m64 vdest = expand565 (to_m64 (d), 0);
    2604               0 :                 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
    2605               0 :                 *q = to_uint64 (vdest);
    2606                 :             }
    2607                 : 
    2608               0 :             twidth--;
    2609               0 :             p++;
    2610               0 :             q++;
    2611                 :         }
    2612                 :         /* Step both line pointers to the next scanline. */
    2613               0 :         mask_line += mask_stride;
    2614               0 :         dst_line += dst_stride;
    2615                 :     }
    2616                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2617                 :     _mm_empty ();
    2618                 : }
    2619                 : /* For each 8-bit pixel: dst = MUL_UN8 (MUL_UN8 (solid source >> 24, mask), dst); four pixels at a time when dst and mask are 4-byte aligned. */
    2620                 : static void
    2621               0 : mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
    2622                 :                         pixman_op_t              op,
    2623                 :                         pixman_image_t *         src_image,
    2624                 :                         pixman_image_t *         mask_image,
    2625                 :                         pixman_image_t *         dst_image,
    2626                 :                         int32_t                  src_x,
    2627                 :                         int32_t                  src_y,
    2628                 :                         int32_t                  mask_x,
    2629                 :                         int32_t                  mask_y,
    2630                 :                         int32_t                  dest_x,
    2631                 :                         int32_t                  dest_y,
    2632                 :                         int32_t                  width,
    2633                 :                         int32_t                  height)
    2634                 : {
    2635                 :     uint8_t *dst_line, *dst;
    2636                 :     uint8_t *mask_line, *mask;
    2637                 :     int dst_stride, mask_stride;
    2638                 :     int32_t w;
    2639                 :     uint32_t src;
    2640                 :     uint8_t sa;
    2641                 :     __m64 vsrc, vsrca;
    2642                 : 
    2643               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    2644               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    2645                 : 
    2646               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2647                 :     /* Only bits 24-31 of the solid value feed the scalar path. */
    2648               0 :     sa = src >> 24;
    2649                 : 
    2650               0 :     vsrc = load8888 (src);
    2651               0 :     vsrca = expand_alpha (vsrc);
    2652                 : 
    2653               0 :     while (height--)
    2654                 :     {
    2655               0 :         dst = dst_line;
    2656               0 :         dst_line += dst_stride;
    2657               0 :         mask = mask_line;
    2658               0 :         mask_line += mask_stride;
    2659               0 :         w = width;
    2660                 :         /* Fast path only when the row pointers dereferenced as uint32_t below are 4-byte aligned (the image struct addresses say nothing about that). */
    2661               0 :         if ((((unsigned long)dst & 3) == 0) &&
    2662               0 :             (((unsigned long)mask & 3) == 0))
    2663                 :         {
    2664               0 :             while (w >= 4)
    2665                 :             {
    2666                 :                 uint32_t m;
    2667                 :                 __m64 vmask;
    2668                 :                 __m64 vdest;
    2669                 :                 /* NOTE(review): m is assigned here but never read in this loop. */
    2670               0 :                 m = 0;
    2671                 : 
    2672               0 :                 vmask = load8888 (*(uint32_t *)mask);
    2673               0 :                 vdest = load8888 (*(uint32_t *)dst);
    2674                 : 
    2675               0 :                 *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
    2676                 : 
    2677               0 :                 dst += 4;
    2678               0 :                 mask += 4;
    2679               0 :                 w -= 4;
    2680                 :             }
    2681                 :         }
    2682                 :         /* Scalar path: leftover pixels, or the whole row when a pointer is misaligned. */
    2683               0 :         while (w--)
    2684                 :         {
    2685                 :             uint16_t tmp;
    2686                 :             uint8_t a;
    2687                 :             uint32_t m, d;
    2688                 : 
    2689               0 :             a = *mask++;
    2690               0 :             d = *dst;
    2691                 : 
    2692               0 :             m = MUL_UN8 (sa, a, tmp);
    2693               0 :             d = MUL_UN8 (m, d, tmp);
    2694                 : 
    2695               0 :             *dst++ = d;
    2696                 :         }
    2697                 :     }
    2698                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2699                 :     _mm_empty ();
    2700               0 : }
    2701                 : /* For each 8-bit pixel: dst = MUL_UN8 (src, dst); four pixels at a time when dst and src are 4-byte aligned. */
    2702                 : static void
    2703               0 : mmx_composite_in_8_8 (pixman_implementation_t *imp,
    2704                 :                       pixman_op_t              op,
    2705                 :                       pixman_image_t *         src_image,
    2706                 :                       pixman_image_t *         mask_image,
    2707                 :                       pixman_image_t *         dst_image,
    2708                 :                       int32_t                  src_x,
    2709                 :                       int32_t                  src_y,
    2710                 :                       int32_t                  mask_x,
    2711                 :                       int32_t                  mask_y,
    2712                 :                       int32_t                  dest_x,
    2713                 :                       int32_t                  dest_y,
    2714                 :                       int32_t                  width,
    2715                 :                       int32_t                  height)
    2716                 : {
    2717                 :     uint8_t     *dst_line, *dst;
    2718                 :     uint8_t     *src_line, *src;
    2719                 :     int src_stride, dst_stride;
    2720                 :     int32_t w;
    2721                 : 
    2722               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    2723               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    2724                 : 
    2725               0 :     while (height--)
    2726                 :     {
    2727               0 :         dst = dst_line;
    2728               0 :         dst_line += dst_stride;
    2729               0 :         src = src_line;
    2730               0 :         src_line += src_stride;
    2731               0 :         w = width;
    2732                 :         /* Fast path only when the row pointers dereferenced as uint32_t below are 4-byte aligned (the image struct addresses say nothing about that). */
    2733               0 :         if ((((unsigned long)dst & 3) == 0) &&
    2734               0 :             (((unsigned long)src & 3) == 0))
    2735                 :         {
    2736               0 :             while (w >= 4)
    2737                 :             {
    2738               0 :                 uint32_t *s = (uint32_t *)src;
    2739               0 :                 uint32_t *d = (uint32_t *)dst;
    2740                 : 
    2741               0 :                 *d = store8888 (in (load8888 (*s), load8888 (*d)));
    2742                 : 
    2743               0 :                 w -= 4;
    2744               0 :                 dst += 4;
    2745               0 :                 src += 4;
    2746                 :             }
    2747                 :         }
    2748                 :         /* Scalar path: leftover pixels, or the whole row when a pointer is misaligned. */
    2749               0 :         while (w--)
    2750                 :         {
    2751                 :             uint8_t s, d;
    2752                 :             uint16_t tmp;
    2753                 : 
    2754               0 :             s = *src;
    2755               0 :             d = *dst;
    2756                 : 
    2757               0 :             *dst = MUL_UN8 (s, d, tmp);
    2758                 : 
    2759               0 :             src++;
    2760               0 :             dst++;
    2761                 :         }
    2762                 :     }
    2763                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2764                 :     _mm_empty ();
    2765               0 : }
    2766                 : /* For each 8-bit pixel: dst = ADD_UN8 (MUL_UN8 (solid source >> 24, mask), dst); four pixels at a time when mask and dst are 4-byte aligned. */
    2767                 : static void
    2768               0 : mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
    2769                 :                          pixman_op_t              op,
    2770                 :                          pixman_image_t *         src_image,
    2771                 :                          pixman_image_t *         mask_image,
    2772                 :                          pixman_image_t *         dst_image,
    2773                 :                          int32_t                  src_x,
    2774                 :                          int32_t                  src_y,
    2775                 :                          int32_t                  mask_x,
    2776                 :                          int32_t                  mask_y,
    2777                 :                          int32_t                  dest_x,
    2778                 :                          int32_t                  dest_y,
    2779                 :                          int32_t                  width,
    2780                 :                          int32_t                  height)
    2781                 : {
    2782                 :     uint8_t     *dst_line, *dst;
    2783                 :     uint8_t     *mask_line, *mask;
    2784                 :     int dst_stride, mask_stride;
    2785                 :     int32_t w;
    2786                 :     uint32_t src;
    2787                 :     uint8_t sa;
    2788                 :     __m64 vsrc, vsrca;
    2789                 : 
    2790               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    2791               0 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    2792                 : 
    2793               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2794                 : 
    2795               0 :     sa = src >> 24;
    2796                 : 
    2797               0 :     if (src == 0)
    2798               0 :         return;
    2799                 :     /* A solid value of 0 returned above, leaving the destination untouched. */
    2800               0 :     vsrc = load8888 (src);
    2801               0 :     vsrca = expand_alpha (vsrc);
    2802                 : 
    2803               0 :     while (height--)
    2804                 :     {
    2805               0 :         dst = dst_line;
    2806               0 :         dst_line += dst_stride;
    2807               0 :         mask = mask_line;
    2808               0 :         mask_line += mask_stride;
    2809               0 :         w = width;
    2810                 :         /* Fast path only when the row pointers dereferenced as uint32_t below are 4-byte aligned (the image struct addresses say nothing about that). */
    2811               0 :         if ((((unsigned long)mask & 3) == 0) &&
    2812               0 :             (((unsigned long)dst  & 3) == 0))
    2813                 :         {
    2814               0 :             while (w >= 4)
    2815                 :             {
    2816               0 :                 __m64 vmask = load8888 (*(uint32_t *)mask);
    2817               0 :                 __m64 vdest = load8888 (*(uint32_t *)dst);
    2818                 : 
    2819               0 :                 *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
    2820                 : 
    2821               0 :                 w -= 4;
    2822               0 :                 dst += 4;
    2823               0 :                 mask += 4;
    2824                 :             }
    2825                 :         }
    2826                 :         /* Scalar path: leftover pixels, or the whole row when a pointer is misaligned. */
    2827               0 :         while (w--)
    2828                 :         {
    2829                 :             uint16_t tmp;
    2830                 :             uint16_t a;
    2831                 :             uint32_t m, d;
    2832                 :             uint32_t r;
    2833                 : 
    2834               0 :             a = *mask++;
    2835               0 :             d = *dst;
    2836                 : 
    2837               0 :             m = MUL_UN8 (sa, a, tmp);
    2838               0 :             r = ADD_UN8 (m, d, tmp);
    2839                 : 
    2840               0 :             *dst++ = r;
    2841                 :         }
    2842                 :     }
    2843                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2844                 :     _mm_empty ();
    2845                 : }
    2846                 : /* Saturating add of an 8-bit source onto an 8-bit destination: bytewise in the head/tail loops, eight bytes at a time with _mm_adds_pu8 in between. */
    2847                 : static void
    2848               0 : mmx_composite_add_8_8 (pixman_implementation_t *imp,
    2849                 :                        pixman_op_t              op,
    2850                 :                        pixman_image_t *         src_image,
    2851                 :                        pixman_image_t *         mask_image,
    2852                 :                        pixman_image_t *         dst_image,
    2853                 :                        int32_t                  src_x,
    2854                 :                        int32_t                  src_y,
    2855                 :                        int32_t                  mask_x,
    2856                 :                        int32_t                  mask_y,
    2857                 :                        int32_t                  dest_x,
    2858                 :                        int32_t                  dest_y,
    2859                 :                        int32_t                  width,
    2860                 :                        int32_t                  height)
    2861                 : {
    2862                 :     uint8_t *dst_line, *dst;
    2863                 :     uint8_t *src_line, *src;
    2864                 :     int dst_stride, src_stride;
    2865                 :     int32_t w;
    2866                 :     uint8_t s, d;
    2867                 :     uint16_t t;
    2868                 : 
    2869                 :     CHECKPOINT ();
    2870                 : 
    2871               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    2872               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    2873                 : 
    2874               0 :     while (height--)
    2875                 :     {
    2876               0 :         dst = dst_line;
    2877               0 :         dst_line += dst_stride;
    2878               0 :         src = src_line;
    2879               0 :         src_line += src_stride;
    2880               0 :         w = width;
    2881                 :         /* Head: single bytes until dst is 8-byte aligned or the row is exhausted. */
    2882               0 :         while (w && (unsigned long)dst & 7)
    2883                 :         {
    2884               0 :             s = *src;
    2885               0 :             d = *dst;
    2886               0 :             t = d + s;
    2887               0 :             s = t | (0 - (t >> 8)); /* t >> 8 is 1 only when d + s exceeded 0xFF; 0 - 1 ORs in all ones, so the byte becomes 0xFF */
    2888               0 :             *dst = s;
    2889                 : 
    2890               0 :             dst++;
    2891               0 :             src++;
    2892               0 :             w--;
    2893                 :         }
    2894                 :         /* Body: eight bytes per iteration; only dst was aligned above, src is read through __m64 * at whatever alignment it has. */
    2895               0 :         while (w >= 8)
    2896                 :         {
    2897               0 :             *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
    2898               0 :             dst += 8;
    2899               0 :             src += 8;
    2900               0 :             w -= 8;
    2901                 :         }
    2902                 :         /* Tail: remaining bytes, same clamped add as the head loop. */
    2903               0 :         while (w)
    2904                 :         {
    2905               0 :             s = *src;
    2906               0 :             d = *dst;
    2907               0 :             t = d + s;
    2908               0 :             s = t | (0 - (t >> 8)); /* carry bit expanded to all ones: clamps the byte to 0xFF */
    2909               0 :             *dst = s;
    2910                 : 
    2911               0 :             dst++;
    2912               0 :             src++;
    2913               0 :             w--;
    2914                 :         }
    2915                 :     }
    2916                 :     /* _mm_empty() (EMMS) clears the MMX state before returning. */
    2917                 :     _mm_empty ();
    2918               0 : }
    2919                 : /*
                         :  * Fast path registered in mmx_fast_paths for the ADD operator with a
                         :  * 32-bit source, no mask and a 32-bit destination.  Each destination
                         :  * pixel becomes _mm_adds_pu8 (src, dst), i.e. a per-byte unsigned add
                         :  * that saturates instead of wrapping.
                         :  *
                         :  * src_x/src_y and dest_x/dest_y select the first pixel in each image;
                         :  * width and height give the rectangle size in pixels.  imp, op,
                         :  * mask_image, mask_x and mask_y are not referenced in this body.
                         :  */
    2920                 : static void
    2921               0 : mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
    2922                 :                              pixman_op_t              op,
    2923                 :                              pixman_image_t *         src_image,
    2924                 :                              pixman_image_t *         mask_image,
    2925                 :                              pixman_image_t *         dst_image,
    2926                 :                              int32_t                  src_x,
    2927                 :                              int32_t                  src_y,
    2928                 :                              int32_t                  mask_x,
    2929                 :                              int32_t                  mask_y,
    2930                 :                              int32_t                  dest_x,
    2931                 :                              int32_t                  dest_y,
    2932                 :                              int32_t                  width,
    2933                 :                              int32_t                  height)
    2934                 : {
    2935                 :     __m64 dst64; /* result of one 64-bit (two-pixel) saturating add */
    2936                 :     uint32_t    *dst_line, *dst;
    2937                 :     uint32_t    *src_line, *src;
    2938                 :     int dst_stride, src_stride; /* added to uint32_t pointers below, so counted in pixels */
    2939                 :     int32_t w;
    2940                 : 
    2941                 :     CHECKPOINT (); /* macro defined outside this chunk */
    2942                 : 
    2943               0 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); /* macro defined elsewhere; used here to obtain src_stride and the first-row pointer */
    2944               0 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2945                 : 
    2946               0 :     while (height--)
    2947                 :     {
    2948               0 :         dst = dst_line;
    2949               0 :         dst_line += dst_stride;
    2950               0 :         src = src_line;
    2951               0 :         src_line += src_stride;
    2952               0 :         w = width;
    2953                 : 
    2954               0 :         while (w && (unsigned long)dst & 7) /* head: one pixel at a time until dst is 8-byte aligned; '&' binds tighter than '&&' */
    2955                 :         {
    2956               0 :             *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
    2957               0 :                                                    _mm_cvtsi32_si64 (*dst)));
    2958               0 :             dst++;
    2959               0 :             src++;
    2960               0 :             w--;
    2961                 :         }
    2962                 : 
    2963               0 :         while (w >= 2) /* body: two pixels per iteration through 64-bit accesses */
    2964                 :         {
    2965               0 :             dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); /* only dst was aligned above; src alignment is not checked */
    2966               0 :             *(uint64_t*)dst = to_uint64 (dst64); /* to_uint64 is defined outside this chunk */
    2967               0 :             dst += 2;
    2968               0 :             src += 2;
    2969               0 :             w -= 2;
    2970                 :         }
    2971                 : 
    2972               0 :         if (w) /* tail: at most one pixel remains after the pairwise loop */
    2973                 :         {
    2974               0 :             *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
    2975               0 :                                                    _mm_cvtsi32_si64 (*dst)));
    2976                 : 
    2977                 :         }
    2978                 :     }
    2979                 : 
    2980                 :     _mm_empty (); /* EMMS: clear MMX state so x87 floating point can be used afterwards */
    2981               0 : }
    2982                 : /*
                         :  * Copy a width x height pixel rectangle from src_bits at (src_x, src_y)
                         :  * to dst_bits at (dst_x, dst_y).
                         :  *
                         :  * Only equal depths of 16 or 32 bits per pixel are handled; any other
                         :  * combination returns FALSE before anything is written.  On success the
                         :  * function returns TRUE.
                         :  *
                         :  * src_stride and dst_stride are treated as counts of uint32_t: they are
                         :  * multiplied by sizeof (uint32_t) before being rescaled to pixel units
                         :  * for addressing and finally to bytes for stepping between rows.
                         :  *
                         :  * Each row is copied in stages chosen by the alignment of the
                         :  * destination pointer: 2-byte units until it is 4-byte aligned, 4-byte
                         :  * units until it is 8-byte aligned, 64-byte blocks through eight MMX
                         :  * registers, then leftover 4-byte units and a final 2-byte unit.
                         :  */
    2983                 : static pixman_bool_t
    2984               0 : pixman_blt_mmx (uint32_t *src_bits,
    2985                 :                 uint32_t *dst_bits,
    2986                 :                 int       src_stride,
    2987                 :                 int       dst_stride,
    2988                 :                 int       src_bpp,
    2989                 :                 int       dst_bpp,
    2990                 :                 int       src_x,
    2991                 :                 int       src_y,
    2992                 :                 int       dst_x,
    2993                 :                 int       dst_y,
    2994                 :                 int       width,
    2995                 :                 int       height)
    2996                 : {
    2997                 :     uint8_t *   src_bytes; /* start of the current source row, as a byte pointer */
    2998                 :     uint8_t *   dst_bytes; /* start of the current destination row, as a byte pointer */
    2999                 :     int byte_width; /* bytes to copy per row */
    3000                 : 
    3001               0 :     if (src_bpp != dst_bpp)
    3002               0 :         return FALSE; /* no depth conversion here */
    3003                 : 
    3004               0 :     if (src_bpp == 16)
    3005                 :     {
    3006               0 :         src_stride = src_stride * (int) sizeof (uint32_t) / 2; /* uint32_t units -> uint16_t units */
    3007               0 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
    3008               0 :         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); /* address of the first source pixel */
    3009               0 :         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
    3010               0 :         byte_width = 2 * width;
    3011               0 :         src_stride *= 2; /* uint16_t units -> bytes */
    3012               0 :         dst_stride *= 2;
    3013                 :     }
    3014               0 :     else if (src_bpp == 32)
    3015                 :     {
    3016               0 :         src_stride = src_stride * (int) sizeof (uint32_t) / 4; /* stays in uint32_t units; written to mirror the 16 bpp branch */
    3017               0 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
    3018               0 :         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
    3019               0 :         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
    3020               0 :         byte_width = 4 * width;
    3021               0 :         src_stride *= 4; /* uint32_t units -> bytes */
    3022               0 :         dst_stride *= 4;
    3023                 :     }
    3024                 :     else
    3025                 :     {
    3026               0 :         return FALSE; /* depths other than 16 and 32 are not handled */
    3027                 :     }
    3028                 : 
    3029               0 :     while (height--)
    3030                 :     {
    3031                 :         int w; /* bytes still to copy in this row */
    3032               0 :         uint8_t *s = src_bytes;
    3033               0 :         uint8_t *d = dst_bytes;
    3034               0 :         src_bytes += src_stride;
    3035               0 :         dst_bytes += dst_stride;
    3036               0 :         w = byte_width;
    3037                 : 
    3038               0 :         while (w >= 2 && ((unsigned long)d & 3)) /* 2-byte copies until d is 4-byte aligned */
    3039                 :         {
    3040               0 :             *(uint16_t *)d = *(uint16_t *)s;
    3041               0 :             w -= 2;
    3042               0 :             s += 2;
    3043               0 :             d += 2;
    3044                 :         }
    3045                 : 
    3046               0 :         while (w >= 4 && ((unsigned long)d & 7)) /* 4-byte copies until d is 8-byte aligned */
    3047                 :         {
    3048               0 :             *(uint32_t *)d = *(uint32_t *)s;
    3049                 : 
    3050               0 :             w -= 4;
    3051               0 :             s += 4;
    3052               0 :             d += 4;
    3053                 :         }
    3054                 : 
    3055               0 :         while (w >= 64) /* bulk: 64 bytes per iteration; only d's alignment was established above */
    3056                 :         {
    3057                 : #if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
    3058               0 :             __asm__ (
    3059                 :                 "movq        (%1),   %%mm0\n" /* eight 8-byte loads from %1 (s) into mm0-mm7 ... */
    3060                 :                 "movq       8(%1),   %%mm1\n"
    3061                 :                 "movq      16(%1),   %%mm2\n"
    3062                 :                 "movq      24(%1),   %%mm3\n"
    3063                 :                 "movq      32(%1),   %%mm4\n"
    3064                 :                 "movq      40(%1),   %%mm5\n"
    3065                 :                 "movq      48(%1),   %%mm6\n"
    3066                 :                 "movq      56(%1),   %%mm7\n"
    3067                 : 
    3068                 :                 "movq      %%mm0,    (%0)\n" /* ... followed by eight 8-byte stores to %0 (d) */
    3069                 :                 "movq      %%mm1,   8(%0)\n"
    3070                 :                 "movq      %%mm2,  16(%0)\n"
    3071                 :                 "movq      %%mm3,  24(%0)\n"
    3072                 :                 "movq      %%mm4,  32(%0)\n"
    3073                 :                 "movq      %%mm5,  40(%0)\n"
    3074                 :                 "movq      %%mm6,  48(%0)\n"
    3075                 :                 "movq      %%mm7,  56(%0)\n"
    3076                 :                 : /* no output operands */
    3077                 :                 : "r" (d), "r" (s) /* %0 = d, %1 = s, both passed as inputs */
    3078                 :                 : "memory", /* the stores through %0 are declared via the memory clobber */
    3079                 :                   "%mm0", "%mm1", "%mm2", "%mm3",
    3080                 :                   "%mm4", "%mm5", "%mm6", "%mm7");
    3081                 : #else
    3082                 :             __m64 v0 = *(__m64 *)(s + 0); /* other compilers: the same eight loads, then eight stores, via __m64 */
    3083                 :             __m64 v1 = *(__m64 *)(s + 8);
    3084                 :             __m64 v2 = *(__m64 *)(s + 16);
    3085                 :             __m64 v3 = *(__m64 *)(s + 24);
    3086                 :             __m64 v4 = *(__m64 *)(s + 32);
    3087                 :             __m64 v5 = *(__m64 *)(s + 40);
    3088                 :             __m64 v6 = *(__m64 *)(s + 48);
    3089                 :             __m64 v7 = *(__m64 *)(s + 56);
    3090                 :             *(__m64 *)(d + 0)  = v0;
    3091                 :             *(__m64 *)(d + 8)  = v1;
    3092                 :             *(__m64 *)(d + 16) = v2;
    3093                 :             *(__m64 *)(d + 24) = v3;
    3094                 :             *(__m64 *)(d + 32) = v4;
    3095                 :             *(__m64 *)(d + 40) = v5;
    3096                 :             *(__m64 *)(d + 48) = v6;
    3097                 :             *(__m64 *)(d + 56) = v7;
    3098                 : #endif
    3099                 : 
    3100               0 :             w -= 64;
    3101               0 :             s += 64;
    3102               0 :             d += 64;
    3103                 :         }
    3104               0 :         while (w >= 4) /* remaining whole 4-byte units */
    3105                 :         {
    3106               0 :             *(uint32_t *)d = *(uint32_t *)s;
    3107                 : 
    3108               0 :             w -= 4;
    3109               0 :             s += 4;
    3110               0 :             d += 4;
    3111                 :         }
    3112               0 :         if (w >= 2) /* byte_width is even and w < 4 here, so at most one 2-byte unit is left */
    3113                 :         {
    3114               0 :             *(uint16_t *)d = *(uint16_t *)s;
    3115               0 :             w -= 2;
    3116               0 :             s += 2;
    3117               0 :             d += 2;
    3118                 :         }
    3119                 :     }
    3120                 : 
    3121                 :     _mm_empty (); /* EMMS: clear MMX state so x87 floating point can be used afterwards */
    3122                 : 
    3123               0 :     return TRUE;
    3124                 : }
    3125                 : /*
                         :  * Composite entry point that performs a plain rectangle copy: it hands
                         :  * the bits pointer, rowstride and bits-per-pixel of both images to
                         :  * pixman_blt_mmx () together with the source and destination origins
                         :  * and the rectangle size.
                         :  *
                         :  * The pixman_bool_t result of pixman_blt_mmx () is discarded, so a
                         :  * FALSE return (unequal or unsupported depths) is not reported.
                         :  * imp, op, mask_image, mask_x and mask_y are not referenced.
                         :  */
    3126                 : static void
    3127               0 : mmx_composite_copy_area (pixman_implementation_t *imp,
    3128                 :                          pixman_op_t              op,
    3129                 :                          pixman_image_t *         src_image,
    3130                 :                          pixman_image_t *         mask_image,
    3131                 :                          pixman_image_t *         dst_image,
    3132                 :                          int32_t                  src_x,
    3133                 :                          int32_t                  src_y,
    3134                 :                          int32_t                  mask_x,
    3135                 :                          int32_t                  mask_y,
    3136                 :                          int32_t                  dest_x,
    3137                 :                          int32_t                  dest_y,
    3138                 :                          int32_t                  width,
    3139                 :                          int32_t                  height)
    3140                 : {
    3141               0 :     pixman_blt_mmx (src_image->bits.bits, /* return value is not checked */
    3142                 :                     dst_image->bits.bits,
    3143                 :                     src_image->bits.rowstride,
    3144                 :                     dst_image->bits.rowstride,
    3145               0 :                     PIXMAN_FORMAT_BPP (src_image->bits.format), /* depth derived from the image format; macro defined elsewhere */
    3146               0 :                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
    3147                 :                     src_x, src_y, dest_x, dest_y, width, height);
    3148               0 : }
    3149                 : /*
                         :  * Compiled out with #if 0; the matching fast-path rows in
                         :  * mmx_fast_paths are disabled as well.
                         :  *
                         :  * Per pixel: read the 8-bit mask value; when it is zero the destination
                         :  * is left alone.  Otherwise the source pixel is taken with its top byte
                         :  * forced to 0xff.  A mask of 0xff stores that pixel directly; any other
                         :  * mask value goes through in_over () with the current destination.
                         :  * load8888, store8888, expand_alpha, expand_alpha_rev, to_m64 and
                         :  * in_over are defined outside this chunk.
                         :  */
    3150                 : #if 0
    3151                 : static void
    3152                 : mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
    3153                 :                                 pixman_op_t              op,
    3154                 :                                 pixman_image_t *         src_image,
    3155                 :                                 pixman_image_t *         mask_image,
    3156                 :                                 pixman_image_t *         dst_image,
    3157                 :                                 int32_t                  src_x,
    3158                 :                                 int32_t                  src_y,
    3159                 :                                 int32_t                  mask_x,
    3160                 :                                 int32_t                  mask_y,
    3161                 :                                 int32_t                  dest_x,
    3162                 :                                 int32_t                  dest_y,
    3163                 :                                 int32_t                  width,
    3164                 :                                 int32_t                  height)
    3165                 : {
    3166                 :     uint32_t  *src, *src_line;
    3167                 :     uint32_t  *dst, *dst_line;
    3168                 :     uint8_t  *mask, *mask_line;
    3169                 :     int src_stride, mask_stride, dst_stride;
    3170                 :     int32_t w;
    3171                 : 
    3172                 :     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3173                 :     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3174                 :     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3175                 : 
    3176                 :     while (height--)
    3177                 :     {
    3178                 :         src = src_line;
    3179                 :         src_line += src_stride;
    3180                 :         dst = dst_line;
    3181                 :         dst_line += dst_stride;
    3182                 :         mask = mask_line;
    3183                 :         mask_line += mask_stride;
    3184                 : 
    3185                 :         w = width;
    3186                 : 
    3187                 :         while (w--) /* one pixel per iteration; no alignment handling or pairing here */
    3188                 :         {
    3189                 :             uint64_t m = *mask; /* 8-bit mask value, widened */
    3190                 : 
    3191                 :             if (m) /* zero mask: destination pixel is left untouched */
    3192                 :             {
    3193                 :                 __m64 s = load8888 (*src | 0xff000000); /* source pixel with its top byte forced to 0xff */
    3194                 : 
    3195                 :                 if (m == 0xff) /* full mask: store the source pixel as is */
    3196                 :                 {
    3197                 :                     *dst = store8888 (s);
    3198                 :                 }
    3199                 :                 else
    3200                 :                 {
    3201                 :                     __m64 sa = expand_alpha (s);
    3202                 :                     __m64 vm = expand_alpha_rev (to_m64 (m));
    3203                 :                     __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); /* partial mask: combine source, its alpha and the mask with dst */
    3204                 : 
    3205                 :                     *dst = store8888 (vdest);
    3206                 :                 }
    3207                 :             }
    3208                 : 
    3209                 :             mask++;
    3210                 :             dst++;
    3211                 :             src++;
    3212                 :         }
    3213                 :     }
    3214                 : 
    3215                 :     _mm_empty ();
    3216                 : }
    3217                 : #endif
    3218                 : /*
                         :  * Fast-path table passed to _pixman_implementation_create () by
                         :  * _pixman_implementation_create_mmx ().  Reading the rows, each entry
                         :  * names an operator, a source, a mask and a destination format, then
                         :  * the MMX routine for that combination (PIXMAN_STD_FAST_PATH and
                         :  * PIXMAN_STD_FAST_PATH_CA are defined outside this chunk).  The
                         :  * { PIXMAN_OP_NONE } row terminates the table.
                         :  */
    3219                 : static const pixman_fast_path_t mmx_fast_paths[] =
    3220                 : {
    3221                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    3222                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    3223                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    3224                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    3225                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    3226                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    3227                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    3228                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    3229                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    3230                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    3231                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    3232                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    3233                 :     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    3234                 :     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    3235                 :     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    3236                 :     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    3237                 :     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    3238                 :     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    3239                 :     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    3240                 :     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    3241                 :     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    3242                 :     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    3243                 :     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    3244                 :     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    3245                 :     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    3246                 :     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    3247                 : #if 0
    3248                 :     /* FIXME: This code is commented out since it's apparently
    3249                 :      * not actually faster than the generic code.
                         :      *
                         :      * The two BGR rows use x8b8g8r8 as the source name (x8b8r8g8 is
                         :      * not a pixman format) and keep source and destination in the
                         :      * same channel order, like every other row of this table, so
                         :      * the block is usable as written if it is ever re-enabled.
    3250                 :      */
    3251                 :     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    3252                 :     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    3253                 :     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    3254                 :     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    3255                 : #endif
    3256                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    3257                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    3258                 :     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    3259                 :     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    3260                 :     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    3261                 : 
    3262                 :     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    3263                 :     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    3264                 :     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    3265                 :     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    3266                 :     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    3267                 :     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
    3268                 : 
    3269                 :     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    3270                 :     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    3271                 :     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    3272                 :     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
    3273                 : 
    3274                 :     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    3275                 :     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    3276                 :     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    3277                 :     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    3278                 :     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    3279                 :     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    3280                 :     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    3281                 :     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    3282                 :     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    3283                 :     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    3284                 :     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    3285                 :     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
    3286                 : 
    3287                 :     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    3288                 :     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
    3289                 : 
    3290                 :     { PIXMAN_OP_NONE },
    3291                 : };
    3292                 : /*
                         :  * blt hook installed by _pixman_implementation_create_mmx ().  Tries
                         :  * pixman_blt_mmx () first; when that returns FALSE the identical
                         :  * request is forwarded to imp->delegate through
                         :  * _pixman_implementation_blt () and its result is returned.
                         :  * Returns TRUE when the MMX copy handled the request.
                         :  */
    3293                 : static pixman_bool_t
    3294               0 : mmx_blt (pixman_implementation_t *imp,
    3295                 :          uint32_t *               src_bits,
    3296                 :          uint32_t *               dst_bits,
    3297                 :          int                      src_stride,
    3298                 :          int                      dst_stride,
    3299                 :          int                      src_bpp,
    3300                 :          int                      dst_bpp,
    3301                 :          int                      src_x,
    3302                 :          int                      src_y,
    3303                 :          int                      dst_x,
    3304                 :          int                      dst_y,
    3305                 :          int                      width,
    3306                 :          int                      height)
    3307                 : {
    3308               0 :     if (!pixman_blt_mmx ( /* FALSE means unequal depths or a depth other than 16/32 */
    3309                 :             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
    3310                 :             src_x, src_y, dst_x, dst_y, width, height))
    3311                 : 
    3312                 :     {
    3313               0 :         return _pixman_implementation_blt ( /* hand the unchanged arguments to the delegate */
    3314                 :             imp->delegate,
    3315                 :             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
    3316                 :             src_x, src_y, dst_x, dst_y, width, height);
    3317                 :     }
    3318                 : 
    3319               0 :     return TRUE;
    3320                 : }
    3321                 : /*
                         :  * fill hook installed by _pixman_implementation_create_mmx ().  Tries
                         :  * pixman_fill_mmx () (defined outside this chunk) first; when that
                         :  * returns a false value the identical request is forwarded to
                         :  * imp->delegate through _pixman_implementation_fill () and its result
                         :  * is returned.  Returns TRUE when the MMX fill handled the request.
                         :  */
    3322                 : static pixman_bool_t
    3323               0 : mmx_fill (pixman_implementation_t *imp,
    3324                 :           uint32_t *               bits,
    3325                 :           int                      stride,
    3326                 :           int                      bpp,
    3327                 :           int                      x,
    3328                 :           int                      y,
    3329                 :           int                      width,
    3330                 :           int                      height,
    3331                 :           uint32_t xor)
    3332                 : {
    3333               0 :     if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor)) /* MMX fill declined the request */
    3334                 :     {
    3335               0 :         return _pixman_implementation_fill ( /* hand the unchanged arguments to the delegate */
    3336                 :             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    3337                 :     }
    3338                 : 
    3339               0 :     return TRUE;
    3340                 : }
    3341                 : /*
                         :  * Build the MMX implementation object.  It is created by
                         :  * _pixman_implementation_create () from the given fallback and the
                         :  * mmx_fast_paths table, then the MMX combiners and the blt/fill hooks
                         :  * are stored into it and the object is returned.
                         :  *
                         :  * Only the operators listed below are assigned: combine_32 gets no
                         :  * PIXMAN_OP_SRC entry and combine_32_ca gets no PIXMAN_OP_SATURATE
                         :  * entry here.
                         :  */
    3342                 : pixman_implementation_t *
    3343               4 : _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
    3344                 : {
    3345               4 :     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); /* NOTE(review): imp is dereferenced below without a NULL check - confirm the creator cannot fail */
    3346                 : 
    3347               4 :     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; /* combine_32 slots receive the *_u combiners */
    3348               4 :     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    3349               4 :     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    3350               4 :     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    3351               4 :     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    3352               4 :     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    3353               4 :     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    3354               4 :     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    3355               4 :     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    3356               4 :     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    3357               4 :     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
    3358                 : 
    3359               4 :     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; /* combine_32_ca slots receive the *_ca combiners */
    3360               4 :     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    3361               4 :     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    3362               4 :     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    3363               4 :     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    3364               4 :     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    3365               4 :     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    3366               4 :     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    3367               4 :     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    3368               4 :     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    3369               4 :     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
    3370                 : 
    3371               4 :     imp->blt = mmx_blt; /* both hooks fall back to imp->delegate when the MMX routine declines */
    3372               4 :     imp->fill = mmx_fill;
    3373                 : 
    3374               4 :     return imp;
    3375                 : }
    3376                 : 
    3377                 : #endif /* USE_MMX */

Generated by: LCOV version 1.7