LCOV - code coverage report
Current view: directory - gfx/cairo/libpixman/src - pixman-sse2.c (source / functions) Found Hit Coverage
Test: app.info Lines: 2223 208 9.4 %
Date: 2012-06-02 Functions: 68 9 13.2 %

       1                 : /*
       2                 :  * Copyright © 2008 Rodrigo Kumpera
       3                 :  * Copyright © 2008 André Tupinambá
       4                 :  *
       5                 :  * Permission to use, copy, modify, distribute, and sell this software and its
       6                 :  * documentation for any purpose is hereby granted without fee, provided that
       7                 :  * the above copyright notice appear in all copies and that both that
       8                 :  * copyright notice and this permission notice appear in supporting
       9                 :  * documentation, and that the name of Red Hat not be used in advertising or
      10                 :  * publicity pertaining to distribution of the software without specific,
      11                 :  * written prior permission.  Red Hat makes no representations about the
      12                 :  * suitability of this software for any purpose.  It is provided "as is"
      13                 :  * without express or implied warranty.
      14                 :  *
      15                 :  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
      16                 :  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      17                 :  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18                 :  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      19                 :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
      20                 :  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
      21                 :  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
      22                 :  * SOFTWARE.
      23                 :  *
      24                 :  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
      25                 :  *          André Tupinambá (andrelrt@gmail.com)
      26                 :  *
      27                 :  * Based on work by Owen Taylor and Søren Sandmann
      28                 :  */
      29                 : #ifdef HAVE_CONFIG_H
      30                 : #include <config.h>
      31                 : #endif
      32                 : 
      33                 : #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
      34                 : #include <emmintrin.h> /* for SSE2 intrinsics */
      35                 : #include "pixman-private.h"
      36                 : #include "pixman-combine32.h"
      37                 : #include "pixman-fast-path.h"
      38                 : 
      39                 : static __m128i mask_0080;
      40                 : static __m128i mask_00ff;
      41                 : static __m128i mask_0101;
      42                 : static __m128i mask_ffff;
      43                 : static __m128i mask_ff000000;
      44                 : static __m128i mask_alpha;
      45                 : 
      46                 : static __m128i mask_565_r;
      47                 : static __m128i mask_565_g1, mask_565_g2;
      48                 : static __m128i mask_565_b;
      49                 : static __m128i mask_red;
      50                 : static __m128i mask_green;
      51                 : static __m128i mask_blue;
      52                 : 
      53                 : static __m128i mask_565_fix_rb;
      54                 : static __m128i mask_565_fix_g;
      55                 : 
      56                 : static force_inline __m128i
      57                 : unpack_32_1x128 (uint32_t data)
      58                 : {
      59               0 :     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
      60                 : }
      61                 : 
      62                 : static force_inline void
      63                 : unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
      64                 : {
      65               0 :     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
      66               0 :     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
      67                 : }
      68                 : 
      69                 : static force_inline __m128i
      70                 : unpack_565_to_8888 (__m128i lo)
      71                 : {
      72                 :     __m128i r, g, b, rb, t;
      73                 : 
      74               0 :     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
      75               0 :     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
      76               0 :     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
      77                 : 
      78               0 :     rb = _mm_or_si128 (r, b);
      79               0 :     t  = _mm_and_si128 (rb, mask_565_fix_rb);
      80               0 :     t  = _mm_srli_epi32 (t, 5);
      81               0 :     rb = _mm_or_si128 (rb, t);
      82                 : 
      83               0 :     t  = _mm_and_si128 (g, mask_565_fix_g);
      84               0 :     t  = _mm_srli_epi32 (t, 6);
      85               0 :     g  = _mm_or_si128 (g, t);
      86                 : 
      87               0 :     return _mm_or_si128 (rb, g);
      88                 : }
      89                 : 
      90                 : static force_inline void
      91                 : unpack_565_128_4x128 (__m128i  data,
      92                 :                       __m128i* data0,
      93                 :                       __m128i* data1,
      94                 :                       __m128i* data2,
      95                 :                       __m128i* data3)
      96                 : {
      97                 :     __m128i lo, hi;
      98                 : 
      99               0 :     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
     100               0 :     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
     101                 : 
     102               0 :     lo = unpack_565_to_8888 (lo);
     103               0 :     hi = unpack_565_to_8888 (hi);
     104                 : 
     105               0 :     unpack_128_2x128 (lo, data0, data1);
     106               0 :     unpack_128_2x128 (hi, data2, data3);
     107                 : }
     108                 : 
     109                 : static force_inline uint16_t
     110                 : pack_565_32_16 (uint32_t pixel)
     111                 : {
     112               0 :     return (uint16_t) (((pixel >> 8) & 0xf800) |
     113               0 :                        ((pixel >> 5) & 0x07e0) |
     114               0 :                        ((pixel >> 3) & 0x001f));
     115                 : }
     116                 : 
     117                 : static force_inline __m128i
     118                 : pack_2x128_128 (__m128i lo, __m128i hi)
     119                 : {
     120                 :     return _mm_packus_epi16 (lo, hi);
     121                 : }
     122                 : 
     123                 : static force_inline __m128i
     124                 : pack_565_2x128_128 (__m128i lo, __m128i hi)
     125                 : {
     126                 :     __m128i data;
     127                 :     __m128i r, g1, g2, b;
     128                 : 
     129               0 :     data = pack_2x128_128 (lo, hi);
     130                 : 
     131               0 :     r  = _mm_and_si128 (data, mask_565_r);
     132               0 :     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
     133               0 :     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
     134               0 :     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
     135                 : 
     136               0 :     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
     137                 : }
     138                 : 
     139                 : static force_inline __m128i
     140                 : pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
     141                 : {
     142               0 :     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
     143                 :                              pack_565_2x128_128 (*xmm2, *xmm3));
     144                 : }
     145                 : 
     146                 : static force_inline int
     147                 : is_opaque (__m128i x)
     148                 : {
     149               0 :     __m128i ffs = _mm_cmpeq_epi8 (x, x);
     150                 : 
     151               0 :     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
     152                 : }
     153                 : 
     154                 : static force_inline int
     155                 : is_zero (__m128i x)
     156                 : {
     157               0 :     return _mm_movemask_epi8 (
     158                 :         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
     159                 : }
     160                 : 
     161                 : static force_inline int
     162                 : is_transparent (__m128i x)
     163                 : {
     164               0 :     return (_mm_movemask_epi8 (
     165               0 :                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
     166                 : }
     167                 : 
     168                 : static force_inline __m128i
     169                 : expand_pixel_32_1x128 (uint32_t data)
     170                 : {
     171               0 :     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
     172                 : }
     173                 : 
     174                 : static force_inline __m128i
     175                 : expand_alpha_1x128 (__m128i data)
     176                 : {
     177               0 :     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
     178                 :                                                      _MM_SHUFFLE (3, 3, 3, 3)),
     179                 :                                 _MM_SHUFFLE (3, 3, 3, 3));
     180                 : }
     181                 : 
     182                 : static force_inline void
     183                 : expand_alpha_2x128 (__m128i  data_lo,
     184                 :                     __m128i  data_hi,
     185                 :                     __m128i* alpha_lo,
     186                 :                     __m128i* alpha_hi)
     187                 : {
     188                 :     __m128i lo, hi;
     189                 : 
     190               0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
     191               0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
     192                 : 
     193               0 :     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
     194               0 :     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
     195                 : }
     196                 : 
     197                 : static force_inline void
     198                 : expand_alpha_rev_2x128 (__m128i  data_lo,
     199                 :                         __m128i  data_hi,
     200                 :                         __m128i* alpha_lo,
     201                 :                         __m128i* alpha_hi)
     202                 : {
     203                 :     __m128i lo, hi;
     204                 : 
     205               0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
     206               0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
     207               0 :     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
     208               0 :     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
     209                 : }
     210                 : 
     211                 : static force_inline void
     212                 : pix_multiply_2x128 (__m128i* data_lo,
     213                 :                     __m128i* data_hi,
     214                 :                     __m128i* alpha_lo,
     215                 :                     __m128i* alpha_hi,
     216                 :                     __m128i* ret_lo,
     217                 :                     __m128i* ret_hi)
     218                 : {
     219                 :     __m128i lo, hi;
     220                 : 
     221               0 :     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
     222               0 :     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
     223               0 :     lo = _mm_adds_epu16 (lo, mask_0080);
     224               0 :     hi = _mm_adds_epu16 (hi, mask_0080);
     225               0 :     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
     226               0 :     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
     227                 : }
     228                 : 
     229                 : static force_inline void
     230                 : pix_add_multiply_2x128 (__m128i* src_lo,
     231                 :                         __m128i* src_hi,
     232                 :                         __m128i* alpha_dst_lo,
     233                 :                         __m128i* alpha_dst_hi,
     234                 :                         __m128i* dst_lo,
     235                 :                         __m128i* dst_hi,
     236                 :                         __m128i* alpha_src_lo,
     237                 :                         __m128i* alpha_src_hi,
     238                 :                         __m128i* ret_lo,
     239                 :                         __m128i* ret_hi)
     240                 : {
     241                 :     __m128i t1_lo, t1_hi;
     242                 :     __m128i t2_lo, t2_hi;
     243                 : 
     244                 :     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
     245                 :     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
     246                 : 
     247               0 :     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
     248               0 :     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
     249                 : }
     250                 : 
     251                 : static force_inline void
     252                 : negate_2x128 (__m128i  data_lo,
     253                 :               __m128i  data_hi,
     254                 :               __m128i* neg_lo,
     255                 :               __m128i* neg_hi)
     256                 : {
     257               0 :     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
     258               0 :     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
     259                 : }
     260                 : 
     261                 : static force_inline void
     262                 : invert_colors_2x128 (__m128i  data_lo,
     263                 :                      __m128i  data_hi,
     264                 :                      __m128i* inv_lo,
     265                 :                      __m128i* inv_hi)
     266                 : {
     267                 :     __m128i lo, hi;
     268                 : 
     269               0 :     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
     270               0 :     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
     271               0 :     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
     272               0 :     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
     273                 : }
     274                 : 
     275                 : static force_inline void
     276                 : over_2x128 (__m128i* src_lo,
     277                 :             __m128i* src_hi,
     278                 :             __m128i* alpha_lo,
     279                 :             __m128i* alpha_hi,
     280                 :             __m128i* dst_lo,
     281                 :             __m128i* dst_hi)
     282                 : {
     283                 :     __m128i t1, t2;
     284                 : 
     285               0 :     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
     286                 : 
     287                 :     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
     288                 : 
     289               0 :     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
     290               0 :     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
     291                 : }
     292                 : 
     293                 : static force_inline void
     294                 : over_rev_non_pre_2x128 (__m128i  src_lo,
     295                 :                         __m128i  src_hi,
     296                 :                         __m128i* dst_lo,
     297                 :                         __m128i* dst_hi)
     298                 : {
     299                 :     __m128i lo, hi;
     300                 :     __m128i alpha_lo, alpha_hi;
     301                 : 
     302               0 :     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
     303                 : 
     304               0 :     lo = _mm_or_si128 (alpha_lo, mask_alpha);
     305               0 :     hi = _mm_or_si128 (alpha_hi, mask_alpha);
     306                 : 
     307               0 :     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
     308                 : 
     309                 :     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
     310                 : 
     311                 :     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
     312                 : }
     313                 : 
     314                 : static force_inline void
     315                 : in_over_2x128 (__m128i* src_lo,
     316                 :                __m128i* src_hi,
     317                 :                __m128i* alpha_lo,
     318                 :                __m128i* alpha_hi,
     319                 :                __m128i* mask_lo,
     320                 :                __m128i* mask_hi,
     321                 :                __m128i* dst_lo,
     322                 :                __m128i* dst_hi)
     323                 : {
     324                 :     __m128i s_lo, s_hi;
     325                 :     __m128i a_lo, a_hi;
     326                 : 
     327                 :     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
     328                 :     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
     329                 : 
     330                 :     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
     331                 : }
     332                 : 
     333                 : /* load 4 pixels from a 16-byte boundary aligned address */
     334                 : static force_inline __m128i
     335                 : load_128_aligned (__m128i* src)
     336                 : {
     337                 :     return _mm_load_si128 (src);
     338                 : }
     339                 : 
     340                 : /* load 4 pixels from a unaligned address */
     341                 : static force_inline __m128i
     342                 : load_128_unaligned (const __m128i* src)
     343                 : {
     344                 :     return _mm_loadu_si128 (src);
     345                 : }
     346                 : 
     347                 : /* save 4 pixels using Write Combining memory on a 16-byte
     348                 :  * boundary aligned address
     349                 :  */
     350                 : static force_inline void
     351                 : save_128_write_combining (__m128i* dst,
     352                 :                           __m128i  data)
     353                 : {
     354                 :     _mm_stream_si128 (dst, data);
     355                 : }
     356                 : 
     357                 : /* save 4 pixels on a 16-byte boundary aligned address */
     358                 : static force_inline void
     359                 : save_128_aligned (__m128i* dst,
     360                 :                   __m128i  data)
     361                 : {
     362                 :     _mm_store_si128 (dst, data);
     363                 : }
     364                 : 
     365                 : /* save 4 pixels on a unaligned address */
     366                 : static force_inline void
     367                 : save_128_unaligned (__m128i* dst,
     368                 :                     __m128i  data)
     369                 : {
     370                 :     _mm_storeu_si128 (dst, data);
     371                 : }
     372                 : 
     373                 : static force_inline __m128i
     374                 : load_32_1x128 (uint32_t data)
     375                 : {
     376               0 :     return _mm_cvtsi32_si128 (data);
     377                 : }
     378                 : 
     379                 : static force_inline __m128i
     380                 : expand_alpha_rev_1x128 (__m128i data)
     381                 : {
     382               0 :     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
     383                 : }
     384                 : 
     385                 : static force_inline __m128i
     386                 : expand_pixel_8_1x128 (uint8_t data)
     387                 : {
     388               0 :     return _mm_shufflelo_epi16 (
     389                 :         unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
     390                 : }
     391                 : 
     392                 : static force_inline __m128i
     393                 : pix_multiply_1x128 (__m128i data,
     394                 :                     __m128i alpha)
     395                 : {
     396               0 :     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
     397                 :                                             mask_0080),
     398                 :                             mask_0101);
     399                 : }
     400                 : 
     401                 : static force_inline __m128i
     402                 : pix_add_multiply_1x128 (__m128i* src,
     403                 :                         __m128i* alpha_dst,
     404                 :                         __m128i* dst,
     405                 :                         __m128i* alpha_src)
     406                 : {
     407               0 :     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
     408               0 :     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
     409                 : 
     410               0 :     return _mm_adds_epu8 (t1, t2);
     411                 : }
     412                 : 
     413                 : static force_inline __m128i
     414                 : negate_1x128 (__m128i data)
     415                 : {
     416               0 :     return _mm_xor_si128 (data, mask_00ff);
     417                 : }
     418                 : 
     419                 : static force_inline __m128i
     420                 : invert_colors_1x128 (__m128i data)
     421                 : {
     422               0 :     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
     423                 : }
     424                 : 
     425                 : static force_inline __m128i
     426                 : over_1x128 (__m128i src, __m128i alpha, __m128i dst)
     427                 : {
     428                 :     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
     429                 : }
     430                 : 
     431                 : static force_inline __m128i
     432                 : in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
     433                 : {
     434               0 :     return over_1x128 (pix_multiply_1x128 (*src, *mask),
     435                 :                        pix_multiply_1x128 (*alpha, *mask),
     436                 :                        *dst);
     437                 : }
     438                 : 
     439                 : static force_inline __m128i
     440                 : over_rev_non_pre_1x128 (__m128i src, __m128i dst)
     441                 : {
     442               0 :     __m128i alpha = expand_alpha_1x128 (src);
     443                 : 
     444               0 :     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
     445                 :                                            _mm_or_si128 (alpha, mask_alpha)),
     446                 :                        alpha,
     447                 :                        dst);
     448                 : }
     449                 : 
     450                 : static force_inline uint32_t
     451                 : pack_1x128_32 (__m128i data)
     452                 : {
     453               0 :     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
     454                 : }
     455                 : 
     456                 : static force_inline __m128i
     457                 : expand565_16_1x128 (uint16_t pixel)
     458                 : {
     459               0 :     __m128i m = _mm_cvtsi32_si128 (pixel);
     460                 : 
     461               0 :     m = unpack_565_to_8888 (m);
     462                 : 
     463               0 :     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
     464                 : }
     465                 : 
     466                 : static force_inline uint32_t
     467                 : core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     468                 : {
     469                 :     uint8_t a;
     470                 :     __m128i xmms;
     471                 : 
     472               0 :     a = src >> 24;
     473                 : 
     474               0 :     if (a == 0xff)
     475                 :     {
     476               0 :         return src;
     477                 :     }
     478               0 :     else if (src)
     479                 :     {
     480               0 :         xmms = unpack_32_1x128 (src);
     481               0 :         return pack_1x128_32 (
     482                 :             over_1x128 (xmms, expand_alpha_1x128 (xmms),
     483                 :                         unpack_32_1x128 (dst)));
     484                 :     }
     485                 : 
     486               0 :     return dst;
     487                 : }
     488                 : 
     489                 : static force_inline uint32_t
     490                 : combine1 (const uint32_t *ps, const uint32_t *pm)
     491                 : {
     492               0 :     uint32_t s = *ps;
     493                 : 
     494               0 :     if (pm)
     495                 :     {
     496                 :         __m128i ms, mm;
     497                 : 
     498               0 :         mm = unpack_32_1x128 (*pm);
     499               0 :         mm = expand_alpha_1x128 (mm);
     500                 : 
     501               0 :         ms = unpack_32_1x128 (s);
     502               0 :         ms = pix_multiply_1x128 (ms, mm);
     503                 : 
     504               0 :         s = pack_1x128_32 (ms);
     505                 :     }
     506                 : 
     507               0 :     return s;
     508                 : }
     509                 : 
     510                 : static force_inline __m128i
     511                 : combine4 (const __m128i *ps, const __m128i *pm)
     512                 : {
     513                 :     __m128i xmm_src_lo, xmm_src_hi;
     514                 :     __m128i xmm_msk_lo, xmm_msk_hi;
     515                 :     __m128i s;
     516                 : 
     517               0 :     if (pm)
     518                 :     {
     519               0 :         xmm_msk_lo = load_128_unaligned (pm);
     520                 : 
     521               0 :         if (is_transparent (xmm_msk_lo))
     522                 :             return _mm_setzero_si128 ();
     523                 :     }
     524                 : 
     525               0 :     s = load_128_unaligned (ps);
     526                 : 
     527               0 :     if (pm)
     528                 :     {
     529               0 :         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
     530               0 :         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
     531                 : 
     532               0 :         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
     533                 : 
     534                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
     535                 :                             &xmm_msk_lo, &xmm_msk_hi,
     536                 :                             &xmm_src_lo, &xmm_src_hi);
     537                 : 
     538               0 :         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
     539                 :     }
     540                 : 
     541               0 :     return s;
     542                 : }
     543                 : 
     544                 : static force_inline void
     545                 : core_combine_over_u_sse2_mask (uint32_t *         pd,
     546                 :                                const uint32_t*    ps,
     547                 :                                const uint32_t*    pm,
     548                 :                                int                w)
     549                 : {
     550                 :     uint32_t s, d;
     551                 : 
     552                 :     /* Align dst on a 16-byte boundary */
     553               0 :     while (w && ((unsigned long)pd & 15))
     554                 :     {
     555               0 :         d = *pd;
     556                 :         s = combine1 (ps, pm);
     557                 : 
     558               0 :         if (s)
     559               0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     560               0 :         pd++;
     561               0 :         ps++;
     562               0 :         pm++;
     563               0 :         w--;
     564                 :     }
     565                 : 
     566               0 :     while (w >= 4)
     567                 :     {
     568               0 :         __m128i mask = load_128_unaligned ((__m128i *)pm);
     569                 : 
     570               0 :         if (!is_zero (mask))
     571                 :         {
     572                 :             __m128i src;
     573                 :             __m128i src_hi, src_lo;
     574                 :             __m128i mask_hi, mask_lo;
     575                 :             __m128i alpha_hi, alpha_lo;
     576                 : 
     577               0 :             src = load_128_unaligned ((__m128i *)ps);
     578                 : 
     579               0 :             if (is_opaque (_mm_and_si128 (src, mask)))
     580                 :             {
     581               0 :                 save_128_aligned ((__m128i *)pd, src);
     582                 :             }
     583                 :             else
     584                 :             {
     585               0 :                 __m128i dst = load_128_aligned ((__m128i *)pd);
     586                 :                 __m128i dst_hi, dst_lo;
     587                 : 
     588               0 :                 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
     589               0 :                 unpack_128_2x128 (src, &src_lo, &src_hi);
     590                 : 
     591               0 :                 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
     592                 :                 pix_multiply_2x128 (&src_lo, &src_hi,
     593                 :                                     &mask_lo, &mask_hi,
     594                 :                                     &src_lo, &src_hi);
     595                 : 
     596               0 :                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
     597                 : 
     598               0 :                 expand_alpha_2x128 (src_lo, src_hi,
     599                 :                                     &alpha_lo, &alpha_hi);
     600                 : 
     601                 :                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
     602                 :                             &dst_lo, &dst_hi);
     603                 : 
     604               0 :                 save_128_aligned (
     605                 :                     (__m128i *)pd,
     606                 :                     pack_2x128_128 (dst_lo, dst_hi));
     607                 :             }
     608                 :         }
     609                 : 
     610               0 :         pm += 4;
     611               0 :         ps += 4;
     612               0 :         pd += 4;
     613               0 :         w -= 4;
     614                 :     }
     615               0 :     while (w)
     616                 :     {
     617               0 :         d = *pd;
     618                 :         s = combine1 (ps, pm);
     619                 : 
     620               0 :         if (s)
     621               0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     622               0 :         pd++;
     623               0 :         ps++;
     624               0 :         pm++;
     625                 : 
     626               0 :         w--;
     627                 :     }
     628                 : }
     629                 : /* OVER-combine w pixels from ps onto pd with no mask: scalar head until pd is 16-byte aligned, 4-pixel SSE2 body, scalar tail. */
     630                 : static force_inline void
     631                 : core_combine_over_u_sse2_no_mask (uint32_t *      pd,
     632                 :                                   const uint32_t*    ps,
     633                 :                                   int                w)
     634                 : {
     635                 :     uint32_t s, d;
     636                 : 
     637                 :     /* Head: one pixel at a time until pd sits on a 16-byte boundary (or w runs out) */
     638               0 :     while (w && ((unsigned long)pd & 15))
     639                 :     {
     640               0 :         d = *pd;
     641               0 :         s = *ps;
     642                 :         /* A source pixel of 0 leaves *pd untouched */
     643               0 :         if (s)
     644               0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     645               0 :         pd++;
     646               0 :         ps++;
     647               0 :         w--;
     648                 :     }
     649                 :     /* Body: four pixels per iteration; ps is read unaligned, pd is read/written aligned */
     650               0 :     while (w >= 4)
     651                 :     {
     652                 :         __m128i src;
     653                 :         __m128i src_hi, src_lo, dst_hi, dst_lo;
     654                 :         __m128i alpha_hi, alpha_lo;
     655                 : 
     656               0 :         src = load_128_unaligned ((__m128i *)ps);
     657                 :         /* When is_zero (src) holds the group is skipped and pd is not touched */
     658               0 :         if (!is_zero (src))
     659                 :         {
     660               0 :             if (is_opaque (src))
     661                 :             {
     662               0 :                 save_128_aligned ((__m128i *)pd, src);
     663                 :             }
     664                 :             else
     665                 :             {
     666               0 :                 __m128i dst = load_128_aligned ((__m128i *)pd);
     667                 :                 /* Split src and dst into lo/hi halves for the 2x128 helpers */
     668               0 :                 unpack_128_2x128 (src, &src_lo, &src_hi);
     669               0 :                 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
     670                 :                 /* over_2x128 leaves its result in dst_lo/dst_hi, which are packed and stored below */
     671               0 :                 expand_alpha_2x128 (src_lo, src_hi,
     672                 :                                     &alpha_lo, &alpha_hi);
     673                 :                 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
     674                 :                             &dst_lo, &dst_hi);
     675                 : 
     676               0 :                 save_128_aligned (
     677                 :                     (__m128i *)pd,
     678                 :                     pack_2x128_128 (dst_lo, dst_hi));
     679                 :             }
     680                 :         }
     681                 :         /* Advance to the next group of four pixels */
     682               0 :         ps += 4;
     683               0 :         pd += 4;
     684               0 :         w -= 4;
     685                 :     }
     686               0 :     while (w)
     687                 :     {
     688               0 :         d = *pd;
     689               0 :         s = *ps;
     690                 :         /* Tail (fewer than 4 pixels left): same per-pixel path as the head */
     691               0 :         if (s)
     692               0 :             *pd = core_combine_over_u_pixel_sse2 (s, d);
     693               0 :         pd++;
     694               0 :         ps++;
     695                 : 
     696               0 :         w--;
     697                 :     }
     698                 : }
     699                 : /* Dispatches to the masked core when pm is non-NULL and to the unmasked core otherwise; imp and op are not used in this body. */
     700                 : static force_inline void
     701               0 : sse2_combine_over_u (pixman_implementation_t *imp,
     702                 :                      pixman_op_t              op,
     703                 :                      uint32_t *               pd,
     704                 :                      const uint32_t *         ps,
     705                 :                      const uint32_t *         pm,
     706                 :                      int                      w)
     707                 : {
     708               0 :     if (pm)
     709                 :         core_combine_over_u_sse2_mask (pd, ps, pm, w);
     710                 :     else
     711                 :         core_combine_over_u_sse2_no_mask (pd, ps, w);
     712               0 : }
     713                 : /* OVER_REVERSE: same OVER helpers as sse2_combine_over_u but with dst in the upper-layer role and the source (from combine1/combine4) below; pm may be NULL. */
     714                 : static void
     715               0 : sse2_combine_over_reverse_u (pixman_implementation_t *imp,
     716                 :                              pixman_op_t              op,
     717                 :                              uint32_t *               pd,
     718                 :                              const uint32_t *         ps,
     719                 :                              const uint32_t *         pm,
     720                 :                              int                      w)
     721                 : {
     722                 :     uint32_t s, d;
     723                 : 
     724                 :     __m128i xmm_dst_lo, xmm_dst_hi;
     725                 :     __m128i xmm_src_lo, xmm_src_hi;
     726                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
     727                 : 
     728                 :     /* Head: one pixel at a time until pd sits on a 16-byte boundary (or w runs out) */
     729               0 :     while (w &&
     730               0 :            ((unsigned long)pd & 15))
     731                 :     {
     732               0 :         d = *pd;
     733                 :         s = combine1 (ps, pm);
     734                 :         /* Argument order is (d, s): the destination pixel is passed where the helper expects the source */
     735               0 :         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
     736               0 :         w--;
     737               0 :         ps++;
     738               0 :         if (pm)
     739               0 :             pm++;
     740                 :     }
     741                 : 
     742               0 :     while (w >= 4)
     743                 :     {
     744                 :         /* Only pd is known to be aligned here; the alignment of
     745                 :          * ps (and pm) is unknown, so the source is loaded unaligned.
     746                 :          */
     747               0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
     748               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     749                 : 
     750               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     751               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     752                 :         /* The alpha used for the blend comes from the destination pixels */
     753               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
     754                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
     755                 :         /* over_2x128 leaves its result in xmm_src_lo/hi, which are packed and stored below */
     756                 :         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     757                 :                     &xmm_alpha_lo, &xmm_alpha_hi,
     758                 :                     &xmm_src_lo, &xmm_src_hi);
     759                 : 
     760                 :         /* rebuild the 4 pixel data and save */
     761               0 :         save_128_aligned ((__m128i*)pd,
     762                 :                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
     763                 : 
     764               0 :         w -= 4;
     765               0 :         ps += 4;
     766               0 :         pd += 4;
     767                 :         /* pm is only advanced when a mask was supplied */
     768               0 :         if (pm)
     769               0 :             pm += 4;
     770                 :     }
     771                 :     /* Tail (fewer than 4 pixels left): same per-pixel path as the head */
     772               0 :     while (w)
     773                 :     {
     774               0 :         d = *pd;
     775                 :         s = combine1 (ps, pm);
     776                 : 
     777               0 :         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
     778               0 :         ps++;
     779               0 :         w--;
     780               0 :         if (pm)
     781               0 :             pm++;
     782                 :     }
     783               0 : }
     784                 : /* Scales dst by the alpha byte (top 8 bits) of src: returns 0 when that alpha is 0 and dst unchanged when it is 0xff. */
     785                 : static force_inline uint32_t
     786                 : core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     787                 : {
     788               0 :     uint32_t maska = src >> 24;
     789                 :     /* The two extreme alpha values are handled without the SIMD multiply */
     790               0 :     if (maska == 0)
     791                 :     {
     792               0 :         return 0;
     793                 :     }
     794               0 :     else if (maska != 0xff)
     795                 :     {
     796                 :         return pack_1x128_32 (
     797                 :             pix_multiply_1x128 (unpack_32_1x128 (dst),
     798                 :                                 expand_alpha_1x128 (unpack_32_1x128 (src))));
     799                 :     }
     800                 :     /* maska == 0xff: dst passes through unchanged */
     801               0 :     return dst;
     802                 : }
     803                 : /* IN: each stored pixel is the source (as returned by combine1/combine4) multiplied by the destination's alpha; pm may be NULL. */
     804                 : static void
     805               0 : sse2_combine_in_u (pixman_implementation_t *imp,
     806                 :                    pixman_op_t              op,
     807                 :                    uint32_t *               pd,
     808                 :                    const uint32_t *         ps,
     809                 :                    const uint32_t *         pm,
     810                 :                    int                      w)
     811                 : {
     812                 :     uint32_t s, d;
     813                 : 
     814                 :     __m128i xmm_src_lo, xmm_src_hi;
     815                 :     __m128i xmm_dst_lo, xmm_dst_hi;
     816                 :     /* Head: one pixel at a time until pd sits on a 16-byte boundary (or w runs out) */
     817               0 :     while (w && ((unsigned long) pd & 15))
     818                 :     {
     819                 :         s = combine1 (ps, pm);
     820               0 :         d = *pd;
     821                 :         /* Arguments are (d, s): the helper scales its second argument by the alpha of its first */
     822               0 :         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
     823               0 :         w--;
     824               0 :         ps++;
     825               0 :         if (pm)
     826               0 :             pm++;
     827                 :     }
     828                 :     /* Body: four pixels per iteration, pd read/written with aligned accesses */
     829               0 :     while (w >= 4)
     830                 :     {
     831               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     832               0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
     833                 :         /* xmm_dst_lo/hi are overwritten with the destination's expanded alpha */
     834               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     835               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     836                 :         /* source * destination alpha, written back into xmm_dst_lo/hi */
     837               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     838                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
     839                 :                             &xmm_dst_lo, &xmm_dst_hi,
     840                 :                             &xmm_dst_lo, &xmm_dst_hi);
     841                 : 
     842               0 :         save_128_aligned ((__m128i*)pd,
     843                 :                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     844                 : 
     845               0 :         ps += 4;
     846               0 :         pd += 4;
     847               0 :         w -= 4;
     848               0 :         if (pm)
     849               0 :             pm += 4;
     850                 :     }
     851                 :     /* Tail (fewer than 4 pixels left): same per-pixel path as the head */
     852               0 :     while (w)
     853                 :     {
     854                 :         s = combine1 (ps, pm);
     855               0 :         d = *pd;
     856                 : 
     857               0 :         *pd++ = core_combine_in_u_pixel_sse2 (d, s);
     858               0 :         w--;
     859               0 :         ps++;
     860               0 :         if (pm)
     861               0 :             pm++;
     862                 :     }
     863               0 : }
     864                 : /* IN_REVERSE: each destination pixel is multiplied by the alpha of the source returned by combine1/combine4; pm may be NULL. */
     865                 : static void
     866               0 : sse2_combine_in_reverse_u (pixman_implementation_t *imp,
     867                 :                            pixman_op_t              op,
     868                 :                            uint32_t *               pd,
     869                 :                            const uint32_t *         ps,
     870                 :                            const uint32_t *         pm,
     871                 :                            int                      w)
     872                 : {
     873                 :     uint32_t s, d;
     874                 : 
     875                 :     __m128i xmm_src_lo, xmm_src_hi;
     876                 :     __m128i xmm_dst_lo, xmm_dst_hi;
     877                 :     /* Head: one pixel at a time until pd sits on a 16-byte boundary (or w runs out) */
     878               0 :     while (w && ((unsigned long) pd & 15))
     879                 :     {
     880                 :         s = combine1 (ps, pm);
     881               0 :         d = *pd;
     882                 :         /* Arguments are (s, d): d is scaled by the alpha of s */
     883               0 :         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
     884               0 :         ps++;
     885               0 :         w--;
     886               0 :         if (pm)
     887               0 :             pm++;
     888                 :     }
     889                 :     /* Body: four pixels per iteration, pd read/written with aligned accesses */
     890               0 :     while (w >= 4)
     891                 :     {
     892               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     893               0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
     894                 :         /* xmm_src_lo/hi are overwritten with the source's expanded alpha */
     895               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     896               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     897                 :         /* destination * source alpha, written back into xmm_dst_lo/hi */
     898               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     899                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     900                 :                             &xmm_src_lo, &xmm_src_hi,
     901                 :                             &xmm_dst_lo, &xmm_dst_hi);
     902                 : 
     903               0 :         save_128_aligned (
     904                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     905                 : 
     906               0 :         ps += 4;
     907               0 :         pd += 4;
     908               0 :         w -= 4;
     909               0 :         if (pm)
     910               0 :             pm += 4;
     911                 :     }
     912                 :     /* Tail (fewer than 4 pixels left): same per-pixel path as the head */
     913               0 :     while (w)
     914                 :     {
     915                 :         s = combine1 (ps, pm);
     916               0 :         d = *pd;
     917                 : 
     918               0 :         *pd++ = core_combine_in_u_pixel_sse2 (s, d);
     919               0 :         w--;
     920               0 :         ps++;
     921               0 :         if (pm)
     922               0 :             pm++;
     923                 :     }
     924               0 : }
     925                 : /* OUT_REVERSE: each destination pixel is multiplied by the negated, expanded alpha of the source from combine1/combine4 (negate_* presumably yields 255 - x; confirm at its definition); pm may be NULL. */
     926                 : static void
     927               0 : sse2_combine_out_reverse_u (pixman_implementation_t *imp,
     928                 :                             pixman_op_t              op,
     929                 :                             uint32_t *               pd,
     930                 :                             const uint32_t *         ps,
     931                 :                             const uint32_t *         pm,
     932                 :                             int                      w)
     933                 : {
     934               0 :     while (w && ((unsigned long) pd & 15))
     935                 :     {
     936                 :         uint32_t s = combine1 (ps, pm);
     937               0 :         uint32_t d = *pd;
     938                 :         /* Head pixel: d * negate (alpha (s)), computed in a single 1x128 lane set */
     939               0 :         *pd++ = pack_1x128_32 (
     940                 :             pix_multiply_1x128 (
     941                 :                 unpack_32_1x128 (d), negate_1x128 (
     942                 :                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
     943                 : 
     944               0 :         if (pm)
     945               0 :             pm++;
     946               0 :         ps++;
     947               0 :         w--;
     948                 :     }
     949                 :     /* Body: four pixels per iteration, pd read/written with aligned accesses */
     950               0 :     while (w >= 4)
     951                 :     {
     952                 :         __m128i xmm_src_lo, xmm_src_hi;
     953                 :         __m128i xmm_dst_lo, xmm_dst_hi;
     954                 : 
     955               0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
     956               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
     957                 : 
     958               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     959               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
     960                 :         /* xmm_src_lo/hi are overwritten in place: first expanded alpha, then its negation */
     961               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     962               0 :         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
     963                 :         /* destination * negated source alpha, written back into xmm_dst_lo/hi */
     964                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
     965                 :                             &xmm_src_lo, &xmm_src_hi,
     966                 :                             &xmm_dst_lo, &xmm_dst_hi);
     967                 : 
     968               0 :         save_128_aligned (
     969                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
     970                 : 
     971               0 :         ps += 4;
     972               0 :         pd += 4;
     973               0 :         if (pm)
     974               0 :             pm += 4;
     975                 : 
     976               0 :         w -= 4;
     977                 :     }
     978                 :     /* Tail (fewer than 4 pixels left): same per-pixel expression as the head */
     979               0 :     while (w)
     980                 :     {
     981                 :         uint32_t s = combine1 (ps, pm);
     982               0 :         uint32_t d = *pd;
     983                 : 
     984               0 :         *pd++ = pack_1x128_32 (
     985                 :             pix_multiply_1x128 (
     986                 :                 unpack_32_1x128 (d), negate_1x128 (
     987                 :                     expand_alpha_1x128 (unpack_32_1x128 (s)))));
     988               0 :         ps++;
     989               0 :         if (pm)
     990               0 :             pm++;
     991               0 :         w--;
     992                 :     }
     993               0 : }
     994                 : /* OUT: each stored pixel is the source (from combine1/combine4) multiplied by the negated, expanded alpha of the destination (negate_* presumably yields 255 - x; confirm at its definition); pm may be NULL. */
     995                 : static void
     996               0 : sse2_combine_out_u (pixman_implementation_t *imp,
     997                 :                     pixman_op_t              op,
     998                 :                     uint32_t *               pd,
     999                 :                     const uint32_t *         ps,
    1000                 :                     const uint32_t *         pm,
    1001                 :                     int                      w)
    1002                 : {
    1003               0 :     while (w && ((unsigned long) pd & 15))
    1004                 :     {
    1005                 :         uint32_t s = combine1 (ps, pm);
    1006               0 :         uint32_t d = *pd;
    1007                 :         /* Head pixel: s * negate (alpha (d)) */
    1008               0 :         *pd++ = pack_1x128_32 (
    1009                 :             pix_multiply_1x128 (
    1010                 :                 unpack_32_1x128 (s), negate_1x128 (
    1011                 :                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1012               0 :         w--;
    1013               0 :         ps++;
    1014               0 :         if (pm)
    1015               0 :             pm++;
    1016                 :     }
    1017                 :     /* Body: four pixels per iteration, pd read/written with aligned accesses */
    1018               0 :     while (w >= 4)
    1019                 :     {
    1020                 :         __m128i xmm_src_lo, xmm_src_hi;
    1021                 :         __m128i xmm_dst_lo, xmm_dst_hi;
    1022                 : 
    1023               0 :         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
    1024               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1025                 : 
    1026               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1027               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1028                 :         /* xmm_dst_lo/hi are overwritten in place: first expanded alpha, then its negation */
    1029               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1030               0 :         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1031                 :         /* source * negated destination alpha, written into xmm_dst_lo/hi */
    1032                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1033                 :                             &xmm_dst_lo, &xmm_dst_hi,
    1034                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1035                 : 
    1036               0 :         save_128_aligned (
    1037                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1038                 : 
    1039               0 :         ps += 4;
    1040               0 :         pd += 4;
    1041               0 :         w -= 4;
    1042               0 :         if (pm)
    1043               0 :             pm += 4;
    1044                 :     }
    1045                 :     /* Tail (fewer than 4 pixels left): same per-pixel expression as the head */
    1046               0 :     while (w)
    1047                 :     {
    1048                 :         uint32_t s = combine1 (ps, pm);
    1049               0 :         uint32_t d = *pd;
    1050                 : 
    1051               0 :         *pd++ = pack_1x128_32 (
    1052                 :             pix_multiply_1x128 (
    1053                 :                 unpack_32_1x128 (s), negate_1x128 (
    1054                 :                     expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1055               0 :         w--;
    1056               0 :         ps++;
    1057               0 :         if (pm)
    1058               0 :             pm++;
    1059                 :     }
    1060               0 : }
    1061                 : /* ATOP for one pixel: feeds (s, alpha (d), d, negated alpha (s)) to pix_add_multiply_1x128 and packs the result to 32 bits; presumably s * da + d * (1 - sa), confirm against pix_add_multiply_1x128. */
    1062                 : static force_inline uint32_t
    1063                 : core_combine_atop_u_pixel_sse2 (uint32_t src,
    1064                 :                                 uint32_t dst)
    1065                 : {
    1066               0 :     __m128i s = unpack_32_1x128 (src);
    1067               0 :     __m128i d = unpack_32_1x128 (dst);
    1068                 :     /* sa is the NEGATED source alpha; da is the plain destination alpha */
    1069               0 :     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    1070               0 :     __m128i da = expand_alpha_1x128 (d);
    1071                 : 
    1072                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
    1073                 : }
    1074                 : /* ATOP combiner: per-pixel head/tail via core_combine_atop_u_pixel_sse2 and a 4-pixel SSE2 body using pix_add_multiply_2x128 with the same operand pairing; pm may be NULL. */
    1075                 : static void
    1076               0 : sse2_combine_atop_u (pixman_implementation_t *imp,
    1077                 :                      pixman_op_t              op,
    1078                 :                      uint32_t *               pd,
    1079                 :                      const uint32_t *         ps,
    1080                 :                      const uint32_t *         pm,
    1081                 :                      int                      w)
    1082                 : {
    1083                 :     uint32_t s, d;
    1084                 : 
    1085                 :     __m128i xmm_src_lo, xmm_src_hi;
    1086                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1087                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1088                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1089                 :     /* Head: one pixel at a time until pd sits on a 16-byte boundary (or w runs out) */
    1090               0 :     while (w && ((unsigned long) pd & 15))
    1091                 :     {
    1092                 :         s = combine1 (ps, pm);
    1093               0 :         d = *pd;
    1094                 : 
    1095               0 :         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
    1096               0 :         w--;
    1097               0 :         ps++;
    1098               0 :         if (pm)
    1099               0 :             pm++;
    1100                 :     }
    1101                 :     /* Body: four pixels per iteration, pd read/written with aligned accesses */
    1102               0 :     while (w >= 4)
    1103                 :     {
    1104               0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
    1105               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1106                 : 
    1107               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1108               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1109                 :         /* Alphas go into separate registers so the colour values stay available */
    1110               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1111                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1112               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1113                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1114                 :         /* Only the source alpha is negated for ATOP */
    1115               0 :         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
    1116                 :                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1117                 :         /* Pairs (src, dst alpha) and (dst, negated src alpha); result goes to xmm_dst_lo/hi */
    1118                 :         pix_add_multiply_2x128 (
    1119                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1120                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1121                 :             &xmm_dst_lo, &xmm_dst_hi);
    1122                 : 
    1123               0 :         save_128_aligned (
    1124                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1125                 : 
    1126               0 :         ps += 4;
    1127               0 :         pd += 4;
    1128               0 :         w -= 4;
    1129               0 :         if (pm)
    1130               0 :             pm += 4;
    1131                 :     }
    1132                 :     /* Tail (fewer than 4 pixels left): same per-pixel path as the head */
    1133               0 :     while (w)
    1134                 :     {
    1135                 :         s = combine1 (ps, pm);
    1136               0 :         d = *pd;
    1137                 : 
    1138               0 :         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
    1139               0 :         w--;
    1140               0 :         ps++;
    1141               0 :         if (pm)
    1142               0 :             pm++;
    1143                 :     }
    1144               0 : }
    1145                 : /* ATOP_REVERSE for one pixel: feeds (s, negated alpha (d), d, alpha (s)) to pix_add_multiply_1x128 and packs the result to 32 bits; presumably s * (1 - da) + d * sa, confirm against pix_add_multiply_1x128. */
    1146                 : static force_inline uint32_t
    1147                 : core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
    1148                 :                                         uint32_t dst)
    1149                 : {
    1150               0 :     __m128i s = unpack_32_1x128 (src);
    1151               0 :     __m128i d = unpack_32_1x128 (dst);
    1152                 :     /* sa is the plain source alpha; da is the NEGATED destination alpha */
    1153               0 :     __m128i sa = expand_alpha_1x128 (s);
    1154               0 :     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    1155                 : 
    1156                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
    1157                 : }
    1158                 : 
    1159                 : static void
    1160               0 : sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
    1161                 :                              pixman_op_t              op,
    1162                 :                              uint32_t *               pd,
    1163                 :                              const uint32_t *         ps,
    1164                 :                              const uint32_t *         pm,
    1165                 :                              int                      w)
    1166                 : {
                               /*
                                * Combines w source pixels from ps (with optional mask pm, which
                                * may be NULL) into the destination pd, in place.  Single pixels
                                * go through core_combine_reverse_atop_u_pixel_sse2; runs of four
                                * go through the 128-bit sequence in the middle loop.
                                * imp and op are not referenced in this body.
                                */
    1167                 :     uint32_t s, d;
    1168                 : 
    1169                 :     __m128i xmm_src_lo, xmm_src_hi;
    1170                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1171                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1172                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1173                 : 
                               /* Head: one pixel at a time until pd is 16-byte aligned (or w
                                * reaches 0), because the next loop uses the *_aligned load and
                                * store helpers on pd. */
    1174               0 :     while (w && ((unsigned long) pd & 15))
    1175                 :     {
                                   /* combine1 is defined elsewhere; presumably it applies the
                                    * mask pixel when pm is non-NULL -- confirm. */
    1176                 :         s = combine1 (ps, pm);
    1177               0 :         d = *pd;
    1178                 : 
    1179               0 :         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
    1180               0 :         ps++;
    1181               0 :         w--;
    1182               0 :         if (pm)
    1183               0 :             pm++;
    1184                 :     }
    1185                 : 
                               /* Body: four 32-bit pixels (one 128-bit register) per iteration. */
    1186               0 :     while (w >= 4)
    1187                 :     {
    1188               0 :         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
    1189               0 :         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    1190                 : 
                                   /* The *_hi variables hold the full register on input and are
                                    * then overwritten with the high half of the split. */
    1191               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1192               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1193                 : 
    1194               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1195                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1196               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1197                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1198                 : 
                                   /* Only the destination alpha is negated; the source alpha is
                                    * passed on unchanged. */
    1199               0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    1200                 :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1201                 : 
                                   /* Same operand order as the single-pixel helper's call:
                                    * (src, negated dst alpha, dst, src alpha).  The result lands
                                    * in xmm_dst_lo/hi.  Presumably src * ~da + dst * sa per
                                    * channel -- confirm against pix_add_multiply_2x128. */
    1202                 :         pix_add_multiply_2x128 (
    1203                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1204                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1205                 :             &xmm_dst_lo, &xmm_dst_hi);
    1206                 : 
    1207               0 :         save_128_aligned (
    1208                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1209                 : 
    1210               0 :         ps += 4;
    1211               0 :         pd += 4;
    1212               0 :         w -= 4;
    1213               0 :         if (pm)
    1214               0 :             pm += 4;
    1215                 :     }
    1216                 : 
                               /* Tail: fewer than four pixels remain; finish them one by one. */
    1217               0 :     while (w)
    1218                 :     {
    1219                 :         s = combine1 (ps, pm);
    1220               0 :         d = *pd;
    1221                 : 
    1222               0 :         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
    1223               0 :         ps++;
    1224               0 :         w--;
    1225               0 :         if (pm)
    1226               0 :             pm++;
    1227                 :     }
    1228               0 : }
    1229                 : 
    1230                 : static force_inline uint32_t
    1231                 : core_combine_xor_u_pixel_sse2 (uint32_t src,
    1232                 :                                uint32_t dst)
    1233                 : {
                               /*
                                * Single-pixel XOR step.  Returns the packed result of
                                * pix_add_multiply_1x128 called with (src, negated dst alpha,
                                * dst, negated src alpha).  Presumably that evaluates to
                                * src * ~da + dst * ~sa per channel -- confirm against
                                * pix_add_multiply_1x128.
                                */
    1234               0 :     __m128i s = unpack_32_1x128 (src);
    1235               0 :     __m128i d = unpack_32_1x128 (dst);
    1236                 : 
                               /* Both alphas are negated here, unlike the reverse-atop helper
                                * above, which negates only the destination alpha. */
    1237               0 :     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    1238               0 :     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
    1239                 : 
    1240                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
    1241                 : }
    1242                 : 
    1243                 : static void
    1244               0 : sse2_combine_xor_u (pixman_implementation_t *imp,
    1245                 :                     pixman_op_t              op,
    1246                 :                     uint32_t *               dst,
    1247                 :                     const uint32_t *         src,
    1248                 :                     const uint32_t *         mask,
    1249                 :                     int                      width)
    1250                 : {
                               /*
                                * XOR combiner over width 32-bit pixels: dst is updated in place
                                * from src and the optional mask (mask may be NULL).  Single
                                * pixels use core_combine_xor_u_pixel_sse2; runs of four use the
                                * 128-bit sequence in the middle loop.  imp and op are not
                                * referenced in this body.
                                */
    1251               0 :     int w = width;
    1252                 :     uint32_t s, d;
    1253               0 :     uint32_t* pd = dst;
    1254               0 :     const uint32_t* ps = src;
    1255               0 :     const uint32_t* pm = mask;
    1256                 : 
    1257                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    1258                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    1259                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    1260                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    1261                 : 
                               /* Head: scalar pixels until pd sits on a 16-byte boundary, so
                                * the vector loop can use aligned loads/stores on pd. */
    1262               0 :     while (w && ((unsigned long) pd & 15))
    1263                 :     {
    1264                 :         s = combine1 (ps, pm);
    1265               0 :         d = *pd;
    1266                 : 
    1267               0 :         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
    1268               0 :         w--;
    1269               0 :         ps++;
    1270               0 :         if (pm)
    1271               0 :             pm++;
    1272                 :     }
    1273                 : 
                               /* Body: four pixels per iteration. */
    1274               0 :     while (w >= 4)
    1275                 :     {
    1276               0 :         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
    1277               0 :         xmm_dst = load_128_aligned ((__m128i*) pd);
    1278                 : 
    1279               0 :         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    1280               0 :         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    1281                 : 
    1282               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1283                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1284               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1285                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1286                 : 
                                   /* Both alphas are negated in place, matching neg_s / neg_d in
                                    * the single-pixel helper. */
    1287               0 :         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
    1288                 :                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    1289               0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    1290                 :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    1291                 : 
                                   /* Operands: (src, negated dst alpha, dst, negated src alpha);
                                    * the result is written to xmm_dst_lo/hi. */
    1292                 :         pix_add_multiply_2x128 (
    1293                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    1294                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    1295                 :             &xmm_dst_lo, &xmm_dst_hi);
    1296                 : 
    1297               0 :         save_128_aligned (
    1298                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1299                 : 
    1300               0 :         ps += 4;
    1301               0 :         pd += 4;
    1302               0 :         w -= 4;
    1303               0 :         if (pm)
    1304               0 :             pm += 4;
    1305                 :     }
    1306                 : 
                               /* Tail: the remaining (fewer than four) pixels. */
    1307               0 :     while (w)
    1308                 :     {
    1309                 :         s = combine1 (ps, pm);
    1310               0 :         d = *pd;
    1311                 : 
    1312               0 :         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
    1313               0 :         w--;
    1314               0 :         ps++;
    1315               0 :         if (pm)
    1316               0 :             pm++;
    1317                 :     }
    1318               0 : }
    1319                 : 
    1320                 : static force_inline void
    1321               0 : sse2_combine_add_u (pixman_implementation_t *imp,
    1322                 :                     pixman_op_t              op,
    1323                 :                     uint32_t *               dst,
    1324                 :                     const uint32_t *         src,
    1325                 :                     const uint32_t *         mask,
    1326                 :                     int                      width)
    1327                 : {
                               /*
                                * ADD combiner over width 32-bit pixels: each destination pixel
                                * becomes the byte-wise unsigned saturating sum (_mm_adds_epu8)
                                * of itself and the combined source pixel.  mask may be NULL.
                                * imp and op are not referenced in this body.
                                */
    1328               0 :     int w = width;
    1329                 :     uint32_t s, d;
    1330               0 :     uint32_t* pd = dst;
    1331               0 :     const uint32_t* ps = src;
    1332               0 :     const uint32_t* pm = mask;
    1333                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned.  '&' binds
                                * tighter than '&&', so the test is w && (pd & 15). */
    1334               0 :     while (w && (unsigned long)pd & 15)
    1335                 :     {
    1336                 :         s = combine1 (ps, pm);
    1337               0 :         d = *pd;
    1338                 : 
    1339               0 :         ps++;
    1340               0 :         if (pm)
    1341               0 :             pm++;
                                   /* Move s and d into the low 32 bits of SSE registers, add
                                    * with per-byte saturation, and move the low 32 bits back. */
    1342               0 :         *pd++ = _mm_cvtsi128_si32 (
    1343                 :             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
    1344               0 :         w--;
    1345                 :     }
    1346                 : 
                               /* Body: four pixels per iteration. */
    1347               0 :     while (w >= 4)
    1348                 :     {
                                   /* This __m128i s shadows the outer uint32_t s inside the
                                    * loop body. */
    1349                 :         __m128i s;
    1350                 : 
    1351               0 :         s = combine4 ((__m128i*)ps, (__m128i*)pm);
    1352                 : 
    1353               0 :         save_128_aligned (
    1354                 :             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
    1355                 : 
    1356               0 :         pd += 4;
    1357               0 :         ps += 4;
    1358               0 :         if (pm)
    1359               0 :             pm += 4;
    1360               0 :         w -= 4;
    1361                 :     }
    1362                 : 
                               /* Tail: remaining pixels.  w is decremented in the loop test, so
                                * it is -1 (and no longer used) once the loop exits. */
    1363               0 :     while (w--)
    1364                 :     {
    1365                 :         s = combine1 (ps, pm);
    1366               0 :         d = *pd;
    1367                 : 
    1368               0 :         ps++;
    1369               0 :         *pd++ = _mm_cvtsi128_si32 (
    1370                 :             _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
    1371               0 :         if (pm)
    1372               0 :             pm++;
    1373                 :     }
    1374               0 : }
    1375                 : 
    1376                 : static force_inline uint32_t
    1377                 : core_combine_saturate_u_pixel_sse2 (uint32_t src,
    1378                 :                                     uint32_t dst)
    1379                 : {
                               /*
                                * Single-pixel SATURATE step.  sa is the top (alpha) byte of src;
                                * da is the top byte of ~dst, i.e. 255 minus the destination
                                * alpha.  When sa > da the source is first multiplied by
                                * DIV_UN8 (da, sa); the (possibly scaled) source is then added to
                                * dst with unsigned 16-bit saturation and packed to 32 bits.
                                */
    1380               0 :     __m128i ms = unpack_32_1x128 (src);
    1381               0 :     __m128i md = unpack_32_1x128 (dst);
    1382               0 :     uint32_t sa = src >> 24;
    1383               0 :     uint32_t da = ~dst >> 24;
    1384                 : 
                               /* sa > da implies sa >= 1, so DIV_UN8 never sees sa == 0 here.
                                * DIV_UN8 is defined elsewhere; presumably da * 255 / sa --
                                * confirm. */
    1385               0 :     if (sa > da)
    1386                 :     {
                                   /* The quotient is placed in the alpha byte (<< 24) and run
                                    * through expand_alpha_1x128, presumably replicating it to
                                    * every channel before the multiply -- confirm. */
    1387               0 :         ms = pix_multiply_1x128 (
    1388               0 :             ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    1389                 :     }
    1390                 : 
    1391               0 :     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
    1392                 : }
    1393                 : 
    1394                 : static void
    1395               0 : sse2_combine_saturate_u (pixman_implementation_t *imp,
    1396                 :                          pixman_op_t              op,
    1397                 :                          uint32_t *               pd,
    1398                 :                          const uint32_t *         ps,
    1399                 :                          const uint32_t *         pm,
    1400                 :                          int                      w)
    1401                 : {
                               /*
                                * SATURATE combiner over w 32-bit pixels, updating pd in place
                                * from ps and the optional mask pm (pm may be NULL).  Groups of
                                * four pixels take a plain saturating add when no pixel needs
                                * scaling, and fall back to core_combine_saturate_u_pixel_sse2
                                * otherwise.  imp and op are not referenced in this body.
                                */
    1402                 :     uint32_t s, d;
    1403                 : 
    1404                 :     uint32_t pack_cmp;
    1405                 :     __m128i xmm_src, xmm_dst;
    1406                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned.  '&' binds
                                * tighter than '&&', so the test is w && (pd & 15). */
    1407               0 :     while (w && (unsigned long)pd & 15)
    1408                 :     {
    1409                 :         s = combine1 (ps, pm);
    1410               0 :         d = *pd;
    1411                 : 
    1412               0 :         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1413               0 :         w--;
    1414               0 :         ps++;
    1415               0 :         if (pm)
    1416               0 :             pm++;
    1417                 :     }
    1418                 : 
    1419               0 :     while (w >= 4)
    1420                 :     {
    1421               0 :         xmm_dst = load_128_aligned  ((__m128i*)pd);
    1422               0 :         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
    1423                 : 
                                   /* Per 32-bit lane: compare the source alpha byte (src >> 24)
                                    * with the alpha byte of dst XOR mask_ff000000, then gather
                                    * the comparison result into a bit mask.  mask_ff000000 is
                                    * defined elsewhere; presumably 0xff000000 in every lane, so
                                    * the XOR complements the destination alpha -- confirm. */
    1424               0 :         pack_cmp = _mm_movemask_epi8 (
    1425                 :             _mm_cmpgt_epi32 (
    1426                 :                 _mm_srli_epi32 (xmm_src, 24),
    1427                 :                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
    1428                 : 
    1429                 :         /* if some alpha src is greater than respective ~alpha dst */
    1430               0 :         if (pack_cmp)
    1431                 :         {
                                       /* Slow path: all four pixels go through the scalar
                                        * helper.  ps is advanced inside the combine1 call; pd
                                        * and pm advance one pixel per step, four pixels in
                                        * total, so pd is 16-byte aligned again afterwards. */
    1432               0 :             s = combine1 (ps++, pm);
    1433               0 :             d = *pd;
    1434               0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1435               0 :             if (pm)
    1436               0 :                 pm++;
    1437                 : 
    1438               0 :             s = combine1 (ps++, pm);
    1439               0 :             d = *pd;
    1440               0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1441               0 :             if (pm)
    1442               0 :                 pm++;
    1443                 : 
    1444               0 :             s = combine1 (ps++, pm);
    1445               0 :             d = *pd;
    1446               0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1447               0 :             if (pm)
    1448               0 :                 pm++;
    1449                 : 
    1450               0 :             s = combine1 (ps++, pm);
    1451               0 :             d = *pd;
    1452               0 :             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1453               0 :             if (pm)
    1454               0 :                 pm++;
    1455                 :         }
    1456                 :         else
    1457                 :         {
                                       /* Fast path: no lane needs scaling, so a byte-wise
                                        * saturating add of the four pixels is enough. */
    1458               0 :             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
    1459                 : 
    1460               0 :             pd += 4;
    1461               0 :             ps += 4;
    1462               0 :             if (pm)
    1463               0 :                 pm += 4;
    1464                 :         }
    1465                 : 
                                   /* Both paths consumed exactly four pixels. */
    1466               0 :         w -= 4;
    1467                 :     }
    1468                 : 
                               /* Tail: remaining pixels, one at a time. */
    1469               0 :     while (w--)
    1470                 :     {
    1471                 :         s = combine1 (ps, pm);
    1472               0 :         d = *pd;
    1473                 : 
    1474               0 :         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    1475               0 :         ps++;
    1476               0 :         if (pm)
    1477               0 :             pm++;
    1478                 :     }
    1479               0 : }
    1480                 : 
    1481                 : static void
    1482               0 : sse2_combine_src_ca (pixman_implementation_t *imp,
    1483                 :                      pixman_op_t              op,
    1484                 :                      uint32_t *               pd,
    1485                 :                      const uint32_t *         ps,
    1486                 :                      const uint32_t *         pm,
    1487                 :                      int                      w)
    1488                 : {
                               /*
                                * SRC combiner with a per-pixel mask: every pd[i] is overwritten
                                * with pix_multiply of ps[i] and pm[i].  The previous contents of
                                * pd are never read.  pm is dereferenced unconditionally, so it
                                * must not be NULL.  imp and op are not referenced in this body.
                                */
    1489                 :     uint32_t s, m;
    1490                 : 
    1491                 :     __m128i xmm_src_lo, xmm_src_hi;
    1492                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1493                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1494                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned. */
    1495               0 :     while (w && (unsigned long)pd & 15)
    1496                 :     {
    1497               0 :         s = *ps++;
    1498               0 :         m = *pm++;
    1499               0 :         *pd++ = pack_1x128_32 (
    1500                 :             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
    1501               0 :         w--;
    1502                 :     }
    1503                 : 
                               /* Body: four pixels per iteration.  Only pd is known to be
                                * aligned, so ps and pm use the unaligned load helper. */
    1504               0 :     while (w >= 4)
    1505                 :     {
    1506               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1507               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1508                 : 
    1509               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1510               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1511                 : 
                                   /* xmm_dst_lo/hi are pure outputs here: src * mask. */
    1512                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1513                 :                             &xmm_mask_lo, &xmm_mask_hi,
    1514                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1515                 : 
    1516               0 :         save_128_aligned (
    1517                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1518                 : 
    1519               0 :         ps += 4;
    1520               0 :         pd += 4;
    1521               0 :         pm += 4;
    1522               0 :         w -= 4;
    1523                 :     }
    1524                 : 
                               /* Tail: the remaining (fewer than four) pixels. */
    1525               0 :     while (w)
    1526                 :     {
    1527               0 :         s = *ps++;
    1528               0 :         m = *pm++;
    1529               0 :         *pd++ = pack_1x128_32 (
    1530                 :             pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
    1531               0 :         w--;
    1532                 :     }
    1533               0 : }
    1534                 : 
    1535                 : static force_inline uint32_t
    1536                 : core_combine_over_ca_pixel_sse2 (uint32_t src,
    1537                 :                                  uint32_t mask,
    1538                 :                                  uint32_t dst)
    1539                 : {
                               /*
                                * Single-pixel OVER step with a per-pixel mask.  Unpacks src,
                                * mask and dst, derives the expanded source alpha, and returns
                                * the packed result of in_over_1x128 (src, src alpha, mask, dst).
                                * in_over_1x128 is defined elsewhere; presumably it composites
                                * (src IN mask) OVER dst -- confirm.
                                */
    1540               0 :     __m128i s = unpack_32_1x128 (src);
    1541               0 :     __m128i expAlpha = expand_alpha_1x128 (s);
    1542               0 :     __m128i unpk_mask = unpack_32_1x128 (mask);
    1543               0 :     __m128i unpk_dst  = unpack_32_1x128 (dst);
    1544                 : 
    1545                 :     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
    1546                 : }
    1547                 : 
    1548                 : static void
    1549               0 : sse2_combine_over_ca (pixman_implementation_t *imp,
    1550                 :                       pixman_op_t              op,
    1551                 :                       uint32_t *               pd,
    1552                 :                       const uint32_t *         ps,
    1553                 :                       const uint32_t *         pm,
    1554                 :                       int                      w)
    1555                 : {
                               /*
                                * OVER combiner with a per-pixel mask over w 32-bit pixels,
                                * updating pd in place.  Single pixels use
                                * core_combine_over_ca_pixel_sse2; runs of four use
                                * in_over_2x128.  pm is dereferenced unconditionally, so it must
                                * not be NULL.  imp and op are not referenced in this body.
                                */
    1556                 :     uint32_t s, m, d;
    1557                 : 
    1558                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1559                 :     __m128i xmm_src_lo, xmm_src_hi;
    1560                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1561                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1562                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned. */
    1563               0 :     while (w && (unsigned long)pd & 15)
    1564                 :     {
    1565               0 :         s = *ps++;
    1566               0 :         m = *pm++;
    1567               0 :         d = *pd;
    1568                 : 
    1569               0 :         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
    1570               0 :         w--;
    1571                 :     }
    1572                 : 
                               /* Body: four pixels per iteration; pd uses aligned access, ps
                                * and pm use unaligned loads. */
    1573               0 :     while (w >= 4)
    1574                 :     {
    1575               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1576               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1577               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1578                 : 
    1579               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1580               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1581               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1582                 : 
    1583               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1584                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1585                 : 
                                   /* The value stored afterwards comes from xmm_dst_lo/hi, so
                                    * in_over_2x128 presumably updates its last pair in place --
                                    * confirm against its definition. */
    1586                 :         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    1587                 :                        &xmm_alpha_lo, &xmm_alpha_hi,
    1588                 :                        &xmm_mask_lo, &xmm_mask_hi,
    1589                 :                        &xmm_dst_lo, &xmm_dst_hi);
    1590                 : 
    1591               0 :         save_128_aligned (
    1592                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1593                 : 
    1594               0 :         ps += 4;
    1595               0 :         pd += 4;
    1596               0 :         pm += 4;
    1597               0 :         w -= 4;
    1598                 :     }
    1599                 : 
                               /* Tail: the remaining (fewer than four) pixels. */
    1600               0 :     while (w)
    1601                 :     {
    1602               0 :         s = *ps++;
    1603               0 :         m = *pm++;
    1604               0 :         d = *pd;
    1605                 : 
    1606               0 :         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
    1607               0 :         w--;
    1608                 :     }
    1609               0 : }
    1610                 : 
    1611                 : static force_inline uint32_t
    1612                 : core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
    1613                 :                                          uint32_t mask,
    1614                 :                                          uint32_t dst)
    1615                 : {
                               /*
                                * Single-pixel OVER_REVERSE step with a per-pixel mask.  Returns
                                * the packed result of over_1x128 (dst, expanded dst alpha,
                                * src * mask).  over_1x128 is defined elsewhere; presumably this
                                * places dst over the masked source -- confirm.
                                */
    1616               0 :     __m128i d = unpack_32_1x128 (dst);
    1617                 : 
    1618               0 :     return pack_1x128_32 (
    1619                 :         over_1x128 (d, expand_alpha_1x128 (d),
    1620                 :                     pix_multiply_1x128 (unpack_32_1x128 (src),
    1621                 :                                         unpack_32_1x128 (mask))));
    1622                 : }
    1623                 : 
    1624                 : static void
    1625               0 : sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
    1626                 :                               pixman_op_t              op,
    1627                 :                               uint32_t *               pd,
    1628                 :                               const uint32_t *         ps,
    1629                 :                               const uint32_t *         pm,
    1630                 :                               int                      w)
    1631                 : {
                               /*
                                * OVER_REVERSE combiner with a per-pixel mask over w 32-bit
                                * pixels, updating pd in place.  Single pixels use
                                * core_combine_over_reverse_ca_pixel_sse2; runs of four use the
                                * 128-bit sequence in the middle loop.  pm is dereferenced
                                * unconditionally, so it must not be NULL.  imp and op are not
                                * referenced in this body.
                                */
    1632                 :     uint32_t s, m, d;
    1633                 : 
    1634                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1635                 :     __m128i xmm_src_lo, xmm_src_hi;
    1636                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1637                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1638                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned. */
    1639               0 :     while (w && (unsigned long)pd & 15)
    1640                 :     {
    1641               0 :         s = *ps++;
    1642               0 :         m = *pm++;
    1643               0 :         d = *pd;
    1644                 : 
    1645               0 :         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
    1646               0 :         w--;
    1647                 :     }
    1648                 : 
                               /* Body: four pixels per iteration. */
    1649               0 :     while (w >= 4)
    1650                 :     {
    1651               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1652               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1653               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1654                 : 
    1655               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1656               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1657               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1658                 : 
                                   /* Here the alpha comes from the destination, not the source. */
    1659               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1660                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
                                   /* src * mask is stored back into xmm_mask_lo/hi. */
    1661                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1662                 :                             &xmm_mask_lo, &xmm_mask_hi,
    1663                 :                             &xmm_mask_lo, &xmm_mask_hi);
    1664                 : 
                                   /* The stored value is taken from xmm_mask_lo/hi, so
                                    * over_2x128 presumably updates its last pair in place --
                                    * confirm against its definition. */
    1665                 :         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1666                 :                     &xmm_alpha_lo, &xmm_alpha_hi,
    1667                 :                     &xmm_mask_lo, &xmm_mask_hi);
    1668                 : 
    1669               0 :         save_128_aligned (
    1670                 :             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
    1671                 : 
    1672               0 :         ps += 4;
    1673               0 :         pd += 4;
    1674               0 :         pm += 4;
    1675               0 :         w -= 4;
    1676                 :     }
    1677                 : 
                               /* Tail: the remaining (fewer than four) pixels. */
    1678               0 :     while (w)
    1679                 :     {
    1680               0 :         s = *ps++;
    1681               0 :         m = *pm++;
    1682               0 :         d = *pd;
    1683                 : 
    1684               0 :         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
    1685               0 :         w--;
    1686                 :     }
    1687               0 : }
    1688                 : 
    1689                 : static void
    1690               0 : sse2_combine_in_ca (pixman_implementation_t *imp,
    1691                 :                     pixman_op_t              op,
    1692                 :                     uint32_t *               pd,
    1693                 :                     const uint32_t *         ps,
    1694                 :                     const uint32_t *         pm,
    1695                 :                     int                      w)
    1696                 : {
                               /*
                                * IN combiner with a per-pixel mask over w 32-bit pixels: each
                                * pd[i] becomes (ps[i] * pm[i]) multiplied by the expanded alpha
                                * of the old pd[i], using the pix_multiply helpers.  pm is
                                * dereferenced unconditionally, so it must not be NULL.  imp and
                                * op are not referenced in this body.
                                */
    1697                 :     uint32_t s, m, d;
    1698                 : 
    1699                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1700                 :     __m128i xmm_src_lo, xmm_src_hi;
    1701                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1702                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1703                 : 
                               /* Head: scalar pixels until pd is 16-byte aligned. */
    1704               0 :     while (w && (unsigned long)pd & 15)
    1705                 :     {
    1706               0 :         s = *ps++;
    1707               0 :         m = *pm++;
    1708               0 :         d = *pd;
    1709                 : 
    1710               0 :         *pd++ = pack_1x128_32 (
    1711                 :             pix_multiply_1x128 (
    1712                 :                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1713                 :                 expand_alpha_1x128 (unpack_32_1x128 (d))));
    1714                 : 
    1715               0 :         w--;
    1716                 :     }
    1717                 : 
                               /* Body: four pixels per iteration. */
    1718               0 :     while (w >= 4)
    1719                 :     {
    1720               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1721               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1722               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1723                 : 
    1724               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1725               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1726               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1727                 : 
                                   /* The destination alpha must be captured now: the next call
                                    * overwrites xmm_dst_lo/hi with src * mask. */
    1728               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1729                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1730                 : 
    1731                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1732                 :                             &xmm_mask_lo, &xmm_mask_hi,
    1733                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1734                 : 
                                   /* Second multiply, in place: (src * mask) * dst alpha. */
    1735                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1736                 :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1737                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1738                 : 
    1739               0 :         save_128_aligned (
    1740                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1741                 : 
    1742               0 :         ps += 4;
    1743               0 :         pd += 4;
    1744               0 :         pm += 4;
    1745               0 :         w -= 4;
    1746                 :     }
    1747                 : 
                               /* Tail: the remaining (fewer than four) pixels. */
    1748               0 :     while (w)
    1749                 :     {
    1750               0 :         s = *ps++;
    1751               0 :         m = *pm++;
    1752               0 :         d = *pd;
    1753                 : 
    1754               0 :         *pd++ = pack_1x128_32 (
    1755                 :             pix_multiply_1x128 (
    1756                 :                 pix_multiply_1x128 (
    1757                 :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1758                 :                 expand_alpha_1x128 (unpack_32_1x128 (d))));
    1759                 : 
    1760               0 :         w--;
    1761                 :     }
    1762               0 : }
    1763                 : 
    1764                 : static void
    1765               0 : sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
    1766                 :                             pixman_op_t              op,
    1767                 :                             uint32_t *               pd,
    1768                 :                             const uint32_t *         ps,
    1769                 :                             const uint32_t *         pm,
    1770                 :                             int                      w)
    1771                 : {
    1772                 :     uint32_t s, m, d;
    1773                 : 
    1774                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1775                 :     __m128i xmm_src_lo, xmm_src_hi;
    1776                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1777                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1778                 : 
    1779               0 :     while (w && (unsigned long)pd & 15)
    1780                 :     {
    1781               0 :         s = *ps++;
    1782               0 :         m = *pm++;
    1783               0 :         d = *pd;
    1784                 : 
    1785               0 :         *pd++ = pack_1x128_32 (
    1786                 :             pix_multiply_1x128 (
    1787                 :                 unpack_32_1x128 (d),
    1788                 :                 pix_multiply_1x128 (unpack_32_1x128 (m),
    1789                 :                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
    1790               0 :         w--;
    1791                 :     }
    1792                 : 
    1793               0 :     while (w >= 4)
    1794                 :     {
    1795               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1796               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1797               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1798                 : 
    1799               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1800               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1801               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1802                 : 
    1803               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1804                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1805                 :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    1806                 :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1807                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1808                 : 
    1809                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1810                 :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1811                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1812                 : 
    1813               0 :         save_128_aligned (
    1814                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1815                 : 
    1816               0 :         ps += 4;
    1817               0 :         pd += 4;
    1818               0 :         pm += 4;
    1819               0 :         w -= 4;
    1820                 :     }
    1821                 : 
    1822               0 :     while (w)
    1823                 :     {
    1824               0 :         s = *ps++;
    1825               0 :         m = *pm++;
    1826               0 :         d = *pd;
    1827                 : 
    1828               0 :         *pd++ = pack_1x128_32 (
    1829                 :             pix_multiply_1x128 (
    1830                 :                 unpack_32_1x128 (d),
    1831                 :                 pix_multiply_1x128 (unpack_32_1x128 (m),
    1832                 :                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
    1833               0 :         w--;
    1834                 :     }
    1835               0 : }
    1836                 : 
    1837                 : static void
    1838               0 : sse2_combine_out_ca (pixman_implementation_t *imp,
    1839                 :                      pixman_op_t              op,
    1840                 :                      uint32_t *               pd,
    1841                 :                      const uint32_t *         ps,
    1842                 :                      const uint32_t *         pm,
    1843                 :                      int                      w)
    1844                 : {
    1845                 :     uint32_t s, m, d;
    1846                 : 
    1847                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1848                 :     __m128i xmm_src_lo, xmm_src_hi;
    1849                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1850                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1851                 : 
    1852               0 :     while (w && (unsigned long)pd & 15)
    1853                 :     {
    1854               0 :         s = *ps++;
    1855               0 :         m = *pm++;
    1856               0 :         d = *pd;
    1857                 : 
    1858               0 :         *pd++ = pack_1x128_32 (
    1859                 :             pix_multiply_1x128 (
    1860                 :                 pix_multiply_1x128 (
    1861                 :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1862                 :                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1863               0 :         w--;
    1864                 :     }
    1865                 : 
    1866               0 :     while (w >= 4)
    1867                 :     {
    1868               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1869               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1870               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1871                 : 
    1872               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1873               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1874               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1875                 : 
    1876               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    1877                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1878               0 :         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
    1879                 :                       &xmm_alpha_lo, &xmm_alpha_hi);
    1880                 : 
    1881                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    1882                 :                             &xmm_mask_lo, &xmm_mask_hi,
    1883                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1884                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1885                 :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1886                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1887                 : 
    1888               0 :         save_128_aligned (
    1889                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1890                 : 
    1891               0 :         ps += 4;
    1892               0 :         pd += 4;
    1893               0 :         pm += 4;
    1894               0 :         w -= 4;
    1895                 :     }
    1896                 : 
    1897               0 :     while (w)
    1898                 :     {
    1899               0 :         s = *ps++;
    1900               0 :         m = *pm++;
    1901               0 :         d = *pd;
    1902                 : 
    1903               0 :         *pd++ = pack_1x128_32 (
    1904                 :             pix_multiply_1x128 (
    1905                 :                 pix_multiply_1x128 (
    1906                 :                     unpack_32_1x128 (s), unpack_32_1x128 (m)),
    1907                 :                 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
    1908                 : 
    1909               0 :         w--;
    1910                 :     }
    1911               0 : }
    1912                 : 
    1913                 : static void
    1914               0 : sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
    1915                 :                              pixman_op_t              op,
    1916                 :                              uint32_t *               pd,
    1917                 :                              const uint32_t *         ps,
    1918                 :                              const uint32_t *         pm,
    1919                 :                              int                      w)
    1920                 : {
    1921                 :     uint32_t s, m, d;
    1922                 : 
    1923                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    1924                 :     __m128i xmm_src_lo, xmm_src_hi;
    1925                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    1926                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    1927                 : 
    1928               0 :     while (w && (unsigned long)pd & 15)
    1929                 :     {
    1930               0 :         s = *ps++;
    1931               0 :         m = *pm++;
    1932               0 :         d = *pd;
    1933                 : 
    1934               0 :         *pd++ = pack_1x128_32 (
    1935                 :             pix_multiply_1x128 (
    1936                 :                 unpack_32_1x128 (d),
    1937                 :                 negate_1x128 (pix_multiply_1x128 (
    1938                 :                                  unpack_32_1x128 (m),
    1939                 :                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
    1940               0 :         w--;
    1941                 :     }
    1942                 : 
    1943               0 :     while (w >= 4)
    1944                 :     {
    1945               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    1946               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    1947               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    1948                 : 
    1949               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    1950               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    1951               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    1952                 : 
    1953               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    1954                 :                             &xmm_alpha_lo, &xmm_alpha_hi);
    1955                 : 
    1956                 :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    1957                 :                             &xmm_alpha_lo, &xmm_alpha_hi,
    1958                 :                             &xmm_mask_lo, &xmm_mask_hi);
    1959                 : 
    1960               0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
    1961                 :                       &xmm_mask_lo, &xmm_mask_hi);
    1962                 : 
    1963                 :         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    1964                 :                             &xmm_mask_lo, &xmm_mask_hi,
    1965                 :                             &xmm_dst_lo, &xmm_dst_hi);
    1966                 : 
    1967               0 :         save_128_aligned (
    1968                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    1969                 : 
    1970               0 :         ps += 4;
    1971               0 :         pd += 4;
    1972               0 :         pm += 4;
    1973               0 :         w -= 4;
    1974                 :     }
    1975                 : 
    1976               0 :     while (w)
    1977                 :     {
    1978               0 :         s = *ps++;
    1979               0 :         m = *pm++;
    1980               0 :         d = *pd;
    1981                 : 
    1982               0 :         *pd++ = pack_1x128_32 (
    1983                 :             pix_multiply_1x128 (
    1984                 :                 unpack_32_1x128 (d),
    1985                 :                 negate_1x128 (pix_multiply_1x128 (
    1986                 :                                  unpack_32_1x128 (m),
    1987                 :                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
    1988               0 :         w--;
    1989                 :     }
    1990               0 : }
    1991                 : 
    1992                 : static force_inline uint32_t
    1993                 : core_combine_atop_ca_pixel_sse2 (uint32_t src,
    1994                 :                                  uint32_t mask,
    1995                 :                                  uint32_t dst)
    1996                 : {
    1997               0 :     __m128i m = unpack_32_1x128 (mask);
    1998               0 :     __m128i s = unpack_32_1x128 (src);
    1999               0 :     __m128i d = unpack_32_1x128 (dst);
    2000               0 :     __m128i sa = expand_alpha_1x128 (s);
    2001               0 :     __m128i da = expand_alpha_1x128 (d);
    2002                 : 
    2003               0 :     s = pix_multiply_1x128 (s, m);
    2004               0 :     m = negate_1x128 (pix_multiply_1x128 (m, sa));
    2005                 : 
    2006                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
    2007                 : }
    2008                 : 
    2009                 : static void
    2010               0 : sse2_combine_atop_ca (pixman_implementation_t *imp,
    2011                 :                       pixman_op_t              op,
    2012                 :                       uint32_t *               pd,
    2013                 :                       const uint32_t *         ps,
    2014                 :                       const uint32_t *         pm,
    2015                 :                       int                      w)
    2016                 : {
    2017                 :     uint32_t s, m, d;
    2018                 : 
    2019                 :     __m128i xmm_src_lo, xmm_src_hi;
    2020                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    2021                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2022                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2023                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    2024                 : 
    2025               0 :     while (w && (unsigned long)pd & 15)
    2026                 :     {
    2027               0 :         s = *ps++;
    2028               0 :         m = *pm++;
    2029               0 :         d = *pd;
    2030                 : 
    2031               0 :         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
    2032               0 :         w--;
    2033                 :     }
    2034                 : 
    2035               0 :     while (w >= 4)
    2036                 :     {
    2037               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2038               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2039               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2040                 : 
    2041               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2042               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2043               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2044                 : 
    2045               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2046                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2047               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2048                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2049                 : 
    2050                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2051                 :                             &xmm_mask_lo, &xmm_mask_hi,
    2052                 :                             &xmm_src_lo, &xmm_src_hi);
    2053                 :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2054                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2055                 :                             &xmm_mask_lo, &xmm_mask_hi);
    2056                 : 
    2057               0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2058                 : 
    2059                 :         pix_add_multiply_2x128 (
    2060                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2061                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2062                 :             &xmm_dst_lo, &xmm_dst_hi);
    2063                 : 
    2064               0 :         save_128_aligned (
    2065                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2066                 : 
    2067               0 :         ps += 4;
    2068               0 :         pd += 4;
    2069               0 :         pm += 4;
    2070               0 :         w -= 4;
    2071                 :     }
    2072                 : 
    2073               0 :     while (w)
    2074                 :     {
    2075               0 :         s = *ps++;
    2076               0 :         m = *pm++;
    2077               0 :         d = *pd;
    2078                 : 
    2079               0 :         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
    2080               0 :         w--;
    2081                 :     }
    2082               0 : }
    2083                 : 
    2084                 : static force_inline uint32_t
    2085                 : core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
    2086                 :                                          uint32_t mask,
    2087                 :                                          uint32_t dst)
    2088                 : {
    2089               0 :     __m128i m = unpack_32_1x128 (mask);
    2090               0 :     __m128i s = unpack_32_1x128 (src);
    2091               0 :     __m128i d = unpack_32_1x128 (dst);
    2092                 : 
    2093               0 :     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    2094               0 :     __m128i sa = expand_alpha_1x128 (s);
    2095                 : 
    2096               0 :     s = pix_multiply_1x128 (s, m);
    2097               0 :     m = pix_multiply_1x128 (m, sa);
    2098                 : 
    2099                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
    2100                 : }
    2101                 : 
    2102                 : static void
    2103               0 : sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
    2104                 :                               pixman_op_t              op,
    2105                 :                               uint32_t *               pd,
    2106                 :                               const uint32_t *         ps,
    2107                 :                               const uint32_t *         pm,
    2108                 :                               int                      w)
    2109                 : {
    2110                 :     uint32_t s, m, d;
    2111                 : 
    2112                 :     __m128i xmm_src_lo, xmm_src_hi;
    2113                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    2114                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2115                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2116                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    2117                 : 
    2118               0 :     while (w && (unsigned long)pd & 15)
    2119                 :     {
    2120               0 :         s = *ps++;
    2121               0 :         m = *pm++;
    2122               0 :         d = *pd;
    2123                 : 
    2124               0 :         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
    2125               0 :         w--;
    2126                 :     }
    2127                 : 
    2128               0 :     while (w >= 4)
    2129                 :     {
    2130               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2131               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2132               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2133                 : 
    2134               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2135               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2136               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2137                 : 
    2138               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2139                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2140               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2141                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2142                 : 
    2143                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2144                 :                             &xmm_mask_lo, &xmm_mask_hi,
    2145                 :                             &xmm_src_lo, &xmm_src_hi);
    2146                 :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2147                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2148                 :                             &xmm_mask_lo, &xmm_mask_hi);
    2149                 : 
    2150               0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    2151                 :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2152                 : 
    2153                 :         pix_add_multiply_2x128 (
    2154                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2155                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2156                 :             &xmm_dst_lo, &xmm_dst_hi);
    2157                 : 
    2158               0 :         save_128_aligned (
    2159                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2160                 : 
    2161               0 :         ps += 4;
    2162               0 :         pd += 4;
    2163               0 :         pm += 4;
    2164               0 :         w -= 4;
    2165                 :     }
    2166                 : 
    2167               0 :     while (w)
    2168                 :     {
    2169               0 :         s = *ps++;
    2170               0 :         m = *pm++;
    2171               0 :         d = *pd;
    2172                 : 
    2173               0 :         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
    2174               0 :         w--;
    2175                 :     }
    2176               0 : }
    2177                 : 
    2178                 : static force_inline uint32_t
    2179                 : core_combine_xor_ca_pixel_sse2 (uint32_t src,
    2180                 :                                 uint32_t mask,
    2181                 :                                 uint32_t dst)
    2182                 : {
    2183               0 :     __m128i a = unpack_32_1x128 (mask);
    2184               0 :     __m128i s = unpack_32_1x128 (src);
    2185               0 :     __m128i d = unpack_32_1x128 (dst);
    2186                 : 
    2187               0 :     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
    2188                 :                                        a, expand_alpha_1x128 (s)));
    2189               0 :     __m128i dest      = pix_multiply_1x128 (s, a);
    2190               0 :     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
    2191                 : 
    2192                 :     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
    2193                 :                                                 &alpha_dst,
    2194                 :                                                 &dest,
    2195                 :                                                 &alpha_src));
    2196                 : }
    2197                 : 
    2198                 : static void
    2199               0 : sse2_combine_xor_ca (pixman_implementation_t *imp,
    2200                 :                      pixman_op_t              op,
    2201                 :                      uint32_t *               pd,
    2202                 :                      const uint32_t *         ps,
    2203                 :                      const uint32_t *         pm,
    2204                 :                      int                      w)
    2205                 : {
    2206                 :     uint32_t s, m, d;
    2207                 : 
    2208                 :     __m128i xmm_src_lo, xmm_src_hi;
    2209                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    2210                 :     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    2211                 :     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    2212                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    2213                 : 
    2214               0 :     while (w && (unsigned long)pd & 15)
    2215                 :     {
    2216               0 :         s = *ps++;
    2217               0 :         m = *pm++;
    2218               0 :         d = *pd;
    2219                 : 
    2220               0 :         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
    2221               0 :         w--;
    2222                 :     }
    2223                 : 
    2224               0 :     while (w >= 4)
    2225                 :     {
    2226               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2227               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2228               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2229                 : 
    2230               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2231               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2232               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2233                 : 
    2234               0 :         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2235                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
    2236               0 :         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
    2237                 :                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2238                 : 
    2239                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2240                 :                             &xmm_mask_lo, &xmm_mask_hi,
    2241                 :                             &xmm_src_lo, &xmm_src_hi);
    2242                 :         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    2243                 :                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    2244                 :                             &xmm_mask_lo, &xmm_mask_hi);
    2245                 : 
    2246               0 :         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
    2247                 :                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
    2248               0 :         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
    2249                 :                       &xmm_mask_lo, &xmm_mask_hi);
    2250                 : 
    2251                 :         pix_add_multiply_2x128 (
    2252                 :             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    2253                 :             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    2254                 :             &xmm_dst_lo, &xmm_dst_hi);
    2255                 : 
    2256               0 :         save_128_aligned (
    2257                 :             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2258                 : 
    2259               0 :         ps += 4;
    2260               0 :         pd += 4;
    2261               0 :         pm += 4;
    2262               0 :         w -= 4;
    2263                 :     }
    2264                 : 
    2265               0 :     while (w)
    2266                 :     {
    2267               0 :         s = *ps++;
    2268               0 :         m = *pm++;
    2269               0 :         d = *pd;
    2270                 : 
    2271               0 :         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
    2272               0 :         w--;
    2273                 :     }
    2274               0 : }
    2275                 : 
    2276                 : static void
    2277               0 : sse2_combine_add_ca (pixman_implementation_t *imp,
    2278                 :                      pixman_op_t              op,
    2279                 :                      uint32_t *               pd,
    2280                 :                      const uint32_t *         ps,
    2281                 :                      const uint32_t *         pm,
    2282                 :                      int                      w)
    2283                 : {
    2284                 :     uint32_t s, m, d;
    2285                 : 
    2286                 :     __m128i xmm_src_lo, xmm_src_hi;
    2287                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    2288                 :     __m128i xmm_mask_lo, xmm_mask_hi;
    2289                 : 
    2290               0 :     while (w && (unsigned long)pd & 15)
    2291                 :     {
    2292               0 :         s = *ps++;
    2293               0 :         m = *pm++;
    2294               0 :         d = *pd;
    2295                 : 
    2296               0 :         *pd++ = pack_1x128_32 (
    2297                 :             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
    2298                 :                                                unpack_32_1x128 (m)),
    2299                 :                            unpack_32_1x128 (d)));
    2300               0 :         w--;
    2301                 :     }
    2302                 : 
    2303               0 :     while (w >= 4)
    2304                 :     {
    2305               0 :         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
    2306               0 :         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
    2307               0 :         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
    2308                 : 
    2309               0 :         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    2310               0 :         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    2311               0 :         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    2312                 : 
    2313                 :         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    2314                 :                             &xmm_mask_lo, &xmm_mask_hi,
    2315                 :                             &xmm_src_lo, &xmm_src_hi);
    2316                 : 
    2317               0 :         save_128_aligned (
    2318                 :             (__m128i*)pd, pack_2x128_128 (
    2319                 :                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
    2320                 :                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
    2321                 : 
    2322               0 :         ps += 4;
    2323               0 :         pd += 4;
    2324               0 :         pm += 4;
    2325               0 :         w -= 4;
    2326                 :     }
    2327                 : 
    2328               0 :     while (w)
    2329                 :     {
    2330               0 :         s = *ps++;
    2331               0 :         m = *pm++;
    2332               0 :         d = *pd;
    2333                 : 
    2334               0 :         *pd++ = pack_1x128_32 (
    2335                 :             _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
    2336                 :                                                unpack_32_1x128 (m)),
    2337                 :                            unpack_32_1x128 (d)));
    2338               0 :         w--;
    2339                 :     }
    2340               0 : }
    2341                 : 
    2342                 : static force_inline __m128i
    2343                 : create_mask_16_128 (uint16_t mask)
    2344                 : {
    2345              16 :     return _mm_set1_epi16 (mask);
    2346                 : }
    2347                 : 
    2348                 : /* Work around a code generation bug in Sun Studio 12. */
    2349                 : #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
    2350                 : # define create_mask_2x32_128(mask0, mask1)                             \
    2351                 :     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
    2352                 : #else
    2353                 : static force_inline __m128i
    2354                 : create_mask_2x32_128 (uint32_t mask0,
    2355                 :                       uint32_t mask1)
    2356                 : {
    2357              61 :     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
    2358                 : }
    2359                 : #endif
    2360                 : 
    2361                 : static void
    2362               0 : sse2_composite_over_n_8888 (pixman_implementation_t *imp,
    2363                 :                             pixman_op_t              op,
    2364                 :                             pixman_image_t *         src_image,
    2365                 :                             pixman_image_t *         mask_image,
    2366                 :                             pixman_image_t *         dst_image,
    2367                 :                             int32_t                  src_x,
    2368                 :                             int32_t                  src_y,
    2369                 :                             int32_t                  mask_x,
    2370                 :                             int32_t                  mask_y,
    2371                 :                             int32_t                  dest_x,
    2372                 :                             int32_t                  dest_y,
    2373                 :                             int32_t                  width,
    2374                 :                             int32_t                  height)
    2375                 : {
    2376                 :     uint32_t src;
    2377                 :     uint32_t    *dst_line, *dst, d;
    2378                 :     int32_t w;
    2379                 :     int dst_stride;
    2380                 :     __m128i xmm_src, xmm_alpha;
    2381                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2382                 : 
    2383               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2384                 : 
    2385               0 :     if (src == 0)
    2386               0 :         return;
    2387                 : 
    2388               0 :     PIXMAN_IMAGE_GET_LINE (
    2389                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2390                 : 
    2391               0 :     xmm_src = expand_pixel_32_1x128 (src);
    2392               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2393                 : 
    2394               0 :     while (height--)
    2395                 :     {
    2396               0 :         dst = dst_line;
    2397                 : 
    2398               0 :         dst_line += dst_stride;
    2399               0 :         w = width;
    2400                 : 
    2401               0 :         while (w && (unsigned long)dst & 15)
    2402                 :         {
    2403               0 :             d = *dst;
    2404               0 :             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
    2405                 :                                                 xmm_alpha,
    2406                 :                                                 unpack_32_1x128 (d)));
    2407               0 :             w--;
    2408                 :         }
    2409                 : 
    2410               0 :         while (w >= 4)
    2411                 :         {
    2412               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    2413                 : 
    2414               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2415                 : 
    2416                 :             over_2x128 (&xmm_src, &xmm_src,
    2417                 :                         &xmm_alpha, &xmm_alpha,
    2418                 :                         &xmm_dst_lo, &xmm_dst_hi);
    2419                 : 
    2420                 :             /* rebuid the 4 pixel data and save*/
    2421               0 :             save_128_aligned (
    2422                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2423                 : 
    2424               0 :             w -= 4;
    2425               0 :             dst += 4;
    2426                 :         }
    2427                 : 
    2428               0 :         while (w)
    2429                 :         {
    2430               0 :             d = *dst;
    2431               0 :             *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
    2432                 :                                                 xmm_alpha,
    2433                 :                                                 unpack_32_1x128 (d)));
    2434               0 :             w--;
    2435                 :         }
    2436                 : 
    2437                 :     }
    2438                 : }
    2439                 : 
    2440                 : static void
    2441               0 : sse2_composite_over_n_0565 (pixman_implementation_t *imp,
    2442                 :                             pixman_op_t              op,
    2443                 :                             pixman_image_t *         src_image,
    2444                 :                             pixman_image_t *         mask_image,
    2445                 :                             pixman_image_t *         dst_image,
    2446                 :                             int32_t                  src_x,
    2447                 :                             int32_t                  src_y,
    2448                 :                             int32_t                  mask_x,
    2449                 :                             int32_t                  mask_y,
    2450                 :                             int32_t                  dest_x,
    2451                 :                             int32_t                  dest_y,
    2452                 :                             int32_t                  width,
    2453                 :                             int32_t                  height)
    2454                 : {
    2455                 :     uint32_t src;
    2456                 :     uint16_t    *dst_line, *dst, d;
    2457                 :     int32_t w;
    2458                 :     int dst_stride;
    2459                 :     __m128i xmm_src, xmm_alpha;
    2460                 :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    2461                 : 
    2462               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2463                 : 
    2464               0 :     if (src == 0)
    2465               0 :         return;
    2466                 : 
    2467               0 :     PIXMAN_IMAGE_GET_LINE (
    2468                 :         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    2469                 : 
    2470               0 :     xmm_src = expand_pixel_32_1x128 (src);
    2471               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2472                 : 
    2473               0 :     while (height--)
    2474                 :     {
    2475               0 :         dst = dst_line;
    2476                 : 
    2477               0 :         dst_line += dst_stride;
    2478               0 :         w = width;
    2479                 : 
    2480               0 :         while (w && (unsigned long)dst & 15)
    2481                 :         {
    2482               0 :             d = *dst;
    2483                 : 
    2484               0 :             *dst++ = pack_565_32_16 (
    2485                 :                 pack_1x128_32 (over_1x128 (xmm_src,
    2486                 :                                            xmm_alpha,
    2487                 :                                            expand565_16_1x128 (d))));
    2488               0 :             w--;
    2489                 :         }
    2490                 : 
    2491               0 :         while (w >= 8)
    2492                 :         {
    2493               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    2494                 : 
    2495               0 :             unpack_565_128_4x128 (xmm_dst,
    2496                 :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    2497                 : 
    2498                 :             over_2x128 (&xmm_src, &xmm_src,
    2499                 :                         &xmm_alpha, &xmm_alpha,
    2500                 :                         &xmm_dst0, &xmm_dst1);
    2501                 :             over_2x128 (&xmm_src, &xmm_src,
    2502                 :                         &xmm_alpha, &xmm_alpha,
    2503                 :                         &xmm_dst2, &xmm_dst3);
    2504                 : 
    2505               0 :             xmm_dst = pack_565_4x128_128 (
    2506                 :                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    2507                 : 
    2508               0 :             save_128_aligned ((__m128i*)dst, xmm_dst);
    2509                 : 
    2510               0 :             dst += 8;
    2511               0 :             w -= 8;
    2512                 :         }
    2513                 : 
    2514               0 :         while (w--)
    2515                 :         {
    2516               0 :             d = *dst;
    2517               0 :             *dst++ = pack_565_32_16 (
    2518                 :                 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
    2519                 :                                            expand565_16_1x128 (d))));
    2520                 :         }
    2521                 :     }
    2522                 : 
    2523                 : }
    2524                 : 
    2525                 : static void
    2526               0 : sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
    2527                 :                                    pixman_op_t              op,
    2528                 :                                    pixman_image_t *         src_image,
    2529                 :                                    pixman_image_t *         mask_image,
    2530                 :                                    pixman_image_t *         dst_image,
    2531                 :                                    int32_t                  src_x,
    2532                 :                                    int32_t                  src_y,
    2533                 :                                    int32_t                  mask_x,
    2534                 :                                    int32_t                  mask_y,
    2535                 :                                    int32_t                  dest_x,
    2536                 :                                    int32_t                  dest_y,
    2537                 :                                    int32_t                  width,
    2538                 :                                    int32_t                  height)
    2539                 : {
    2540                 :     uint32_t src, srca;
    2541                 :     uint32_t    *dst_line, d;
    2542                 :     uint32_t    *mask_line, m;
    2543                 :     uint32_t pack_cmp;
    2544                 :     int dst_stride, mask_stride;
    2545                 : 
    2546                 :     __m128i xmm_src, xmm_alpha;
    2547                 :     __m128i xmm_dst;
    2548                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    2549                 : 
    2550                 :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    2551                 : 
    2552               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2553               0 :     srca = src >> 24;
    2554                 : 
    2555               0 :     if (src == 0)
    2556               0 :         return;
    2557                 : 
    2558               0 :     PIXMAN_IMAGE_GET_LINE (
    2559                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2560               0 :     PIXMAN_IMAGE_GET_LINE (
    2561                 :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    2562                 : 
    2563               0 :     xmm_src = _mm_unpacklo_epi8 (
    2564                 :         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    2565               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2566               0 :     mmx_src   = xmm_src;
    2567               0 :     mmx_alpha = xmm_alpha;
    2568                 : 
    2569               0 :     while (height--)
    2570                 :     {
    2571               0 :         int w = width;
    2572               0 :         const uint32_t *pm = (uint32_t *)mask_line;
    2573               0 :         uint32_t *pd = (uint32_t *)dst_line;
    2574                 : 
    2575               0 :         dst_line += dst_stride;
    2576               0 :         mask_line += mask_stride;
    2577                 : 
    2578               0 :         while (w && (unsigned long)pd & 15)
    2579                 :         {
    2580               0 :             m = *pm++;
    2581                 : 
    2582               0 :             if (m)
    2583                 :             {
    2584               0 :                 d = *pd;
    2585                 : 
    2586               0 :                 mmx_mask = unpack_32_1x128 (m);
    2587               0 :                 mmx_dest = unpack_32_1x128 (d);
    2588                 : 
    2589               0 :                 *pd = pack_1x128_32 (
    2590                 :                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
    2591                 :                                    mmx_dest));
    2592                 :             }
    2593                 : 
    2594               0 :             pd++;
    2595               0 :             w--;
    2596                 :         }
    2597                 : 
    2598               0 :         while (w >= 4)
    2599                 :         {
    2600               0 :             xmm_mask = load_128_unaligned ((__m128i*)pm);
    2601                 : 
    2602               0 :             pack_cmp =
    2603               0 :                 _mm_movemask_epi8 (
    2604                 :                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    2605                 : 
    2606                 :             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
    2607               0 :             if (pack_cmp != 0xffff)
    2608                 :             {
    2609               0 :                 xmm_dst = load_128_aligned ((__m128i*)pd);
    2610                 : 
    2611               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    2612                 : 
    2613                 :                 pix_multiply_2x128 (&xmm_src, &xmm_src,
    2614                 :                                     &xmm_mask_lo, &xmm_mask_hi,
    2615                 :                                     &xmm_mask_lo, &xmm_mask_hi);
    2616               0 :                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
    2617                 : 
    2618               0 :                 save_128_aligned (
    2619                 :                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
    2620                 :             }
    2621                 : 
    2622               0 :             pd += 4;
    2623               0 :             pm += 4;
    2624               0 :             w -= 4;
    2625                 :         }
    2626                 : 
    2627               0 :         while (w)
    2628                 :         {
    2629               0 :             m = *pm++;
    2630                 : 
    2631               0 :             if (m)
    2632                 :             {
    2633               0 :                 d = *pd;
    2634                 : 
    2635               0 :                 mmx_mask = unpack_32_1x128 (m);
    2636               0 :                 mmx_dest = unpack_32_1x128 (d);
    2637                 : 
    2638               0 :                 *pd = pack_1x128_32 (
    2639                 :                     _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
    2640                 :                                    mmx_dest));
    2641                 :             }
    2642                 : 
    2643               0 :             pd++;
    2644               0 :             w--;
    2645                 :         }
    2646                 :     }
    2647                 : 
    2648                 : }
    2649                 : 
    2650                 : static void
    2651               0 : sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
    2652                 :                                     pixman_op_t              op,
    2653                 :                                     pixman_image_t *         src_image,
    2654                 :                                     pixman_image_t *         mask_image,
    2655                 :                                     pixman_image_t *         dst_image,
    2656                 :                                     int32_t                  src_x,
    2657                 :                                     int32_t                  src_y,
    2658                 :                                     int32_t                  mask_x,
    2659                 :                                     int32_t                  mask_y,
    2660                 :                                     int32_t                  dest_x,
    2661                 :                                     int32_t                  dest_y,
    2662                 :                                     int32_t                  width,
    2663                 :                                     int32_t                  height)
    2664                 : {
    2665                 :     uint32_t src;
    2666                 :     uint32_t    *dst_line, d;
    2667                 :     uint32_t    *mask_line, m;
    2668                 :     uint32_t pack_cmp;
    2669                 :     int dst_stride, mask_stride;
    2670                 : 
    2671                 :     __m128i xmm_src, xmm_alpha;
    2672                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2673                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    2674                 : 
    2675                 :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    2676                 : 
    2677               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    2678                 : 
    2679               0 :     if (src == 0)
    2680               0 :         return;
    2681                 : 
    2682               0 :     PIXMAN_IMAGE_GET_LINE (
    2683                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2684               0 :     PIXMAN_IMAGE_GET_LINE (
    2685                 :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    2686                 : 
    2687               0 :     xmm_src = _mm_unpacklo_epi8 (
    2688                 :         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    2689               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    2690               0 :     mmx_src   = xmm_src;
    2691               0 :     mmx_alpha = xmm_alpha;
    2692                 : 
    2693               0 :     while (height--)
    2694                 :     {
    2695               0 :         int w = width;
    2696               0 :         const uint32_t *pm = (uint32_t *)mask_line;
    2697               0 :         uint32_t *pd = (uint32_t *)dst_line;
    2698                 : 
    2699               0 :         dst_line += dst_stride;
    2700               0 :         mask_line += mask_stride;
    2701                 : 
    2702               0 :         while (w && (unsigned long)pd & 15)
    2703                 :         {
    2704               0 :             m = *pm++;
    2705                 : 
    2706               0 :             if (m)
    2707                 :             {
    2708               0 :                 d = *pd;
    2709               0 :                 mmx_mask = unpack_32_1x128 (m);
    2710               0 :                 mmx_dest = unpack_32_1x128 (d);
    2711                 : 
    2712               0 :                 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
    2713                 :                                                   &mmx_alpha,
    2714                 :                                                   &mmx_mask,
    2715                 :                                                   &mmx_dest));
    2716                 :             }
    2717                 : 
    2718               0 :             pd++;
    2719               0 :             w--;
    2720                 :         }
    2721                 : 
    2722               0 :         while (w >= 4)
    2723                 :         {
    2724               0 :             xmm_mask = load_128_unaligned ((__m128i*)pm);
    2725                 : 
    2726               0 :             pack_cmp =
    2727               0 :                 _mm_movemask_epi8 (
    2728                 :                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    2729                 : 
    2730                 :             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
    2731               0 :             if (pack_cmp != 0xffff)
    2732                 :             {
    2733               0 :                 xmm_dst = load_128_aligned ((__m128i*)pd);
    2734                 : 
    2735               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    2736               0 :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2737                 : 
    2738                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    2739                 :                                &xmm_alpha, &xmm_alpha,
    2740                 :                                &xmm_mask_lo, &xmm_mask_hi,
    2741                 :                                &xmm_dst_lo, &xmm_dst_hi);
    2742                 : 
    2743               0 :                 save_128_aligned (
    2744                 :                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2745                 :             }
    2746                 : 
    2747               0 :             pd += 4;
    2748               0 :             pm += 4;
    2749               0 :             w -= 4;
    2750                 :         }
    2751                 : 
    2752               0 :         while (w)
    2753                 :         {
    2754               0 :             m = *pm++;
    2755                 : 
    2756               0 :             if (m)
    2757                 :             {
    2758               0 :                 d = *pd;
    2759               0 :                 mmx_mask = unpack_32_1x128 (m);
    2760               0 :                 mmx_dest = unpack_32_1x128 (d);
    2761                 : 
    2762               0 :                 *pd = pack_1x128_32 (
    2763                 :                     in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
    2764                 :             }
    2765                 : 
    2766               0 :             pd++;
    2767               0 :             w--;
    2768                 :         }
    2769                 :     }
    2770                 : 
    2771                 : }
    2772                 : 
    2773                 : static void
    2774               0 : sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
    2775                 :                                  pixman_op_t              op,
    2776                 :                                  pixman_image_t *         src_image,
    2777                 :                                  pixman_image_t *         mask_image,
    2778                 :                                  pixman_image_t *         dst_image,
    2779                 :                                  int32_t                  src_x,
    2780                 :                                  int32_t                  src_y,
    2781                 :                                  int32_t                  mask_x,
    2782                 :                                  int32_t                  mask_y,
    2783                 :                                  int32_t                  dest_x,
    2784                 :                                  int32_t                  dest_y,
    2785                 :                                  int32_t                  width,
    2786                 :                                  int32_t                  height)
    2787                 : {
    2788                 :     uint32_t    *dst_line, *dst;
    2789                 :     uint32_t    *src_line, *src;
    2790                 :     uint32_t mask;
    2791                 :     int32_t w;
    2792                 :     int dst_stride, src_stride;
    2793                 : 
    2794                 :     __m128i xmm_mask;
    2795                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    2796                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2797                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    2798                 : 
    2799               0 :     PIXMAN_IMAGE_GET_LINE (
    2800                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2801               0 :     PIXMAN_IMAGE_GET_LINE (
    2802                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2803                 : 
    2804               0 :     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
    2805                 : 
    2806               0 :     xmm_mask = create_mask_16_128 (mask >> 24);
    2807                 : 
    2808               0 :     while (height--)
    2809                 :     {
    2810               0 :         dst = dst_line;
    2811               0 :         dst_line += dst_stride;
    2812               0 :         src = src_line;
    2813               0 :         src_line += src_stride;
    2814               0 :         w = width;
    2815                 : 
    2816               0 :         while (w && (unsigned long)dst & 15)
    2817                 :         {
    2818               0 :             uint32_t s = *src++;
    2819                 : 
    2820               0 :             if (s)
    2821                 :             {
    2822               0 :                 uint32_t d = *dst;
    2823                 :                 
    2824               0 :                 __m128i ms = unpack_32_1x128 (s);
    2825               0 :                 __m128i alpha    = expand_alpha_1x128 (ms);
    2826               0 :                 __m128i dest     = xmm_mask;
    2827               0 :                 __m128i alpha_dst = unpack_32_1x128 (d);
    2828                 :                 
    2829               0 :                 *dst = pack_1x128_32 (
    2830                 :                     in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    2831                 :             }
    2832               0 :             dst++;
    2833               0 :             w--;
    2834                 :         }
    2835                 : 
    2836               0 :         while (w >= 4)
    2837                 :         {
    2838               0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    2839                 : 
    2840               0 :             if (!is_zero (xmm_src))
    2841                 :             {
    2842               0 :                 xmm_dst = load_128_aligned ((__m128i*)dst);
    2843                 :                 
    2844               0 :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    2845               0 :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    2846               0 :                 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    2847                 :                                     &xmm_alpha_lo, &xmm_alpha_hi);
    2848                 :                 
    2849                 :                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    2850                 :                                &xmm_alpha_lo, &xmm_alpha_hi,
    2851                 :                                &xmm_mask, &xmm_mask,
    2852                 :                                &xmm_dst_lo, &xmm_dst_hi);
    2853                 :                 
    2854               0 :                 save_128_aligned (
    2855                 :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    2856                 :             }
    2857                 :                 
    2858               0 :             dst += 4;
    2859               0 :             src += 4;
    2860               0 :             w -= 4;
    2861                 :         }
    2862                 : 
    2863               0 :         while (w)
    2864                 :         {
    2865               0 :             uint32_t s = *src++;
    2866                 : 
    2867               0 :             if (s)
    2868                 :             {
    2869               0 :                 uint32_t d = *dst;
    2870                 :                 
    2871               0 :                 __m128i ms = unpack_32_1x128 (s);
    2872               0 :                 __m128i alpha = expand_alpha_1x128 (ms);
    2873               0 :                 __m128i mask  = xmm_mask;
    2874               0 :                 __m128i dest  = unpack_32_1x128 (d);
    2875                 :                 
    2876               0 :                 *dst = pack_1x128_32 (
    2877                 :                     in_over_1x128 (&ms, &alpha, &mask, &dest));
    2878                 :             }
    2879                 : 
    2880               0 :             dst++;
    2881               0 :             w--;
    2882                 :         }
    2883                 :     }
    2884                 : 
    2885               0 : }
    2886                 : 
    2887                 : static void
    2888              10 : sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
    2889                 :                               pixman_op_t              op,
    2890                 :                               pixman_image_t *         src_image,
    2891                 :                               pixman_image_t *         mask_image,
    2892                 :                               pixman_image_t *         dst_image,
    2893                 :                               int32_t                  src_x,
    2894                 :                               int32_t                  src_y,
    2895                 :                               int32_t                  mask_x,
    2896                 :                               int32_t                  mask_y,
    2897                 :                               int32_t                  dest_x,
    2898                 :                               int32_t                  dest_y,
    2899                 :                               int32_t                  width,
    2900                 :                               int32_t                  height)
    2901                 : {
    2902                 :     uint32_t    *dst_line, *dst;
    2903                 :     uint32_t    *src_line, *src;
    2904                 :     int32_t w;
    2905                 :     int dst_stride, src_stride;
    2906                 : 
    2907                 : 
    2908              10 :     PIXMAN_IMAGE_GET_LINE (
    2909                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2910              10 :     PIXMAN_IMAGE_GET_LINE (
    2911                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2912                 : 
    2913             411 :     while (height--)
    2914                 :     {
    2915             391 :         dst = dst_line;
    2916             391 :         dst_line += dst_stride;
    2917             391 :         src = src_line;
    2918             391 :         src_line += src_stride;
    2919             391 :         w = width;
    2920                 : 
    2921            1022 :         while (w && (unsigned long)dst & 15)
    2922                 :         {
    2923             240 :             *dst++ = *src++ | 0xff000000;
    2924             240 :             w--;
    2925                 :         }
    2926                 : 
    2927            1260 :         while (w >= 16)
    2928                 :         {
    2929                 :             __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
    2930                 :             
    2931             956 :             xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
    2932             956 :             xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
    2933             956 :             xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
    2934             956 :             xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
    2935                 :             
    2936             956 :             save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
    2937             956 :             save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
    2938             956 :             save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
    2939             956 :             save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
    2940                 :             
    2941             478 :             dst += 16;
    2942             478 :             src += 16;
    2943             478 :             w -= 16;
    2944                 :         }
    2945                 : 
    2946            1038 :         while (w)
    2947                 :         {
    2948             256 :             *dst++ = *src++ | 0xff000000;
    2949             256 :             w--;
    2950                 :         }
    2951                 :     }
    2952                 : 
    2953              10 : }
    2954                 : 
    2955                 : static void
    2956               0 : sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
    2957                 :                                  pixman_op_t              op,
    2958                 :                                  pixman_image_t *         src_image,
    2959                 :                                  pixman_image_t *         mask_image,
    2960                 :                                  pixman_image_t *         dst_image,
    2961                 :                                  int32_t                  src_x,
    2962                 :                                  int32_t                  src_y,
    2963                 :                                  int32_t                  mask_x,
    2964                 :                                  int32_t                  mask_y,
    2965                 :                                  int32_t                  dest_x,
    2966                 :                                  int32_t                  dest_y,
    2967                 :                                  int32_t                  width,
    2968                 :                                  int32_t                  height)
    2969                 : {
    2970                 :     uint32_t    *dst_line, *dst;
    2971                 :     uint32_t    *src_line, *src;
    2972                 :     uint32_t mask;
    2973                 :     int dst_stride, src_stride;
    2974                 :     int32_t w;
    2975                 : 
    2976                 :     __m128i xmm_mask, xmm_alpha;
    2977                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    2978                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    2979                 : 
    2980               0 :     PIXMAN_IMAGE_GET_LINE (
    2981                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    2982               0 :     PIXMAN_IMAGE_GET_LINE (
    2983                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    2984                 : 
    2985               0 :     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
    2986                 : 
    2987               0 :     xmm_mask = create_mask_16_128 (mask >> 24);
    2988               0 :     xmm_alpha = mask_00ff;
    2989                 : 
    2990               0 :     while (height--)
    2991                 :     {
    2992               0 :         dst = dst_line;
    2993               0 :         dst_line += dst_stride;
    2994               0 :         src = src_line;
    2995               0 :         src_line += src_stride;
    2996               0 :         w = width;
    2997                 : 
    2998               0 :         while (w && (unsigned long)dst & 15)
    2999                 :         {
    3000               0 :             uint32_t s = (*src++) | 0xff000000;
    3001               0 :             uint32_t d = *dst;
    3002                 : 
    3003               0 :             __m128i src   = unpack_32_1x128 (s);
    3004               0 :             __m128i alpha = xmm_alpha;
    3005               0 :             __m128i mask  = xmm_mask;
    3006               0 :             __m128i dest  = unpack_32_1x128 (d);
    3007                 : 
    3008               0 :             *dst++ = pack_1x128_32 (
    3009                 :                 in_over_1x128 (&src, &alpha, &mask, &dest));
    3010                 : 
    3011               0 :             w--;
    3012                 :         }
    3013                 : 
    3014               0 :         while (w >= 4)
    3015                 :         {
    3016               0 :             xmm_src = _mm_or_si128 (
    3017                 :                 load_128_unaligned ((__m128i*)src), mask_ff000000);
    3018               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    3019                 : 
    3020               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3021               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    3022                 : 
    3023                 :             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3024                 :                            &xmm_alpha, &xmm_alpha,
    3025                 :                            &xmm_mask, &xmm_mask,
    3026                 :                            &xmm_dst_lo, &xmm_dst_hi);
    3027                 : 
    3028               0 :             save_128_aligned (
    3029                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3030                 : 
    3031               0 :             dst += 4;
    3032               0 :             src += 4;
    3033               0 :             w -= 4;
    3034                 : 
    3035                 :         }
    3036                 : 
    3037               0 :         while (w)
    3038                 :         {
    3039               0 :             uint32_t s = (*src++) | 0xff000000;
    3040               0 :             uint32_t d = *dst;
    3041                 : 
    3042               0 :             __m128i src  = unpack_32_1x128 (s);
    3043               0 :             __m128i alpha = xmm_alpha;
    3044               0 :             __m128i mask  = xmm_mask;
    3045               0 :             __m128i dest  = unpack_32_1x128 (d);
    3046                 : 
    3047               0 :             *dst++ = pack_1x128_32 (
    3048                 :                 in_over_1x128 (&src, &alpha, &mask, &dest));
    3049                 : 
    3050               0 :             w--;
    3051                 :         }
    3052                 :     }
    3053                 : 
    3054               0 : }
    3055                 : 
    3056                 : static void
    3057               0 : sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
    3058                 :                                pixman_op_t              op,
    3059                 :                                pixman_image_t *         src_image,
    3060                 :                                pixman_image_t *         mask_image,
    3061                 :                                pixman_image_t *         dst_image,
    3062                 :                                int32_t                  src_x,
    3063                 :                                int32_t                  src_y,
    3064                 :                                int32_t                  mask_x,
    3065                 :                                int32_t                  mask_y,
    3066                 :                                int32_t                  dest_x,
    3067                 :                                int32_t                  dest_y,
    3068                 :                                int32_t                  width,
    3069                 :                                int32_t                  height)
    3070                 : {
    3071                 :     int dst_stride, src_stride;
    3072                 :     uint32_t    *dst_line, *dst;
    3073                 :     uint32_t    *src_line, *src;
    3074                 : 
    3075               0 :     PIXMAN_IMAGE_GET_LINE (
    3076                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3077               0 :     PIXMAN_IMAGE_GET_LINE (
    3078                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3079                 : 
    3080               0 :     dst = dst_line;
    3081               0 :     src = src_line;
    3082                 : 
    3083               0 :     while (height--)
    3084                 :     {
    3085                 :         sse2_combine_over_u (imp, op, dst, src, NULL, width);
    3086                 : 
    3087               0 :         dst += dst_stride;
    3088               0 :         src += src_stride;
    3089                 :     }
    3090               0 : }
    3091                 : 
    3092                 : static force_inline uint16_t
    3093                 : composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
    3094                 : {
    3095                 :     __m128i ms;
    3096                 : 
    3097               0 :     ms = unpack_32_1x128 (src);
    3098               0 :     return pack_565_32_16 (
    3099                 :         pack_1x128_32 (
    3100                 :             over_1x128 (
    3101                 :                 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
    3102                 : }
    3103                 : 
    3104                 : static void
    3105               0 : sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
    3106                 :                                pixman_op_t              op,
    3107                 :                                pixman_image_t *         src_image,
    3108                 :                                pixman_image_t *         mask_image,
    3109                 :                                pixman_image_t *         dst_image,
    3110                 :                                int32_t                  src_x,
    3111                 :                                int32_t                  src_y,
    3112                 :                                int32_t                  mask_x,
    3113                 :                                int32_t                  mask_y,
    3114                 :                                int32_t                  dest_x,
    3115                 :                                int32_t                  dest_y,
    3116                 :                                int32_t                  width,
    3117                 :                                int32_t                  height)
    3118                 : {
    3119                 :     uint16_t    *dst_line, *dst, d;
    3120                 :     uint32_t    *src_line, *src, s;
    3121                 :     int dst_stride, src_stride;
    3122                 :     int32_t w;
    3123                 : 
    3124                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    3125                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    3126                 :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3127                 : 
    3128               0 :     PIXMAN_IMAGE_GET_LINE (
    3129                 :         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3130               0 :     PIXMAN_IMAGE_GET_LINE (
    3131                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3132                 : 
    3133               0 :     while (height--)
    3134                 :     {
    3135               0 :         dst = dst_line;
    3136               0 :         src = src_line;
    3137                 : 
    3138               0 :         dst_line += dst_stride;
    3139               0 :         src_line += src_stride;
    3140               0 :         w = width;
    3141                 : 
    3142                 :         /* Align dst on a 16-byte boundary */
    3143               0 :         while (w &&
    3144               0 :                ((unsigned long)dst & 15))
    3145                 :         {
    3146               0 :             s = *src++;
    3147               0 :             d = *dst;
    3148                 : 
    3149               0 :             *dst++ = composite_over_8888_0565pixel (s, d);
    3150               0 :             w--;
    3151                 :         }
    3152                 : 
    3153                 :         /* It's an 8 pixel loop */
    3154               0 :         while (w >= 8)
    3155                 :         {
    3156                 :             /* I'm loading unaligned because I'm not sure
    3157                 :              * about the address alignment.
    3158                 :              */
    3159               0 :             xmm_src = load_128_unaligned ((__m128i*) src);
    3160               0 :             xmm_dst = load_128_aligned ((__m128i*) dst);
    3161                 : 
    3162                 :             /* Unpacking */
    3163               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3164               0 :             unpack_565_128_4x128 (xmm_dst,
    3165                 :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3166               0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    3167                 :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    3168                 : 
    3169                 :             /* I'm loading the next 4 pixels from memory
    3170                 :              * ahead of time to optimize the memory read.
    3171                 :              */
    3172               0 :             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
    3173                 : 
    3174                 :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3175                 :                         &xmm_alpha_lo, &xmm_alpha_hi,
    3176                 :                         &xmm_dst0, &xmm_dst1);
    3177                 : 
    3178                 :             /* Unpacking */
    3179               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3180               0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    3181                 :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    3182                 : 
    3183                 :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    3184                 :                         &xmm_alpha_lo, &xmm_alpha_hi,
    3185                 :                         &xmm_dst2, &xmm_dst3);
    3186                 : 
    3187               0 :             save_128_aligned (
    3188                 :                 (__m128i*)dst, pack_565_4x128_128 (
    3189                 :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3190                 : 
    3191               0 :             w -= 8;
    3192               0 :             dst += 8;
    3193               0 :             src += 8;
    3194                 :         }
    3195                 : 
    3196               0 :         while (w--)
    3197                 :         {
    3198               0 :             s = *src++;
    3199               0 :             d = *dst;
    3200                 : 
    3201               0 :             *dst++ = composite_over_8888_0565pixel (s, d);
    3202                 :         }
    3203                 :     }
    3204                 : 
    3205               0 : }
    3206                 : 
    3207                 : static void
    3208               0 : sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
    3209                 :                               pixman_op_t              op,
    3210                 :                               pixman_image_t *         src_image,
    3211                 :                               pixman_image_t *         mask_image,
    3212                 :                               pixman_image_t *         dst_image,
    3213                 :                               int32_t                  src_x,
    3214                 :                               int32_t                  src_y,
    3215                 :                               int32_t                  mask_x,
    3216                 :                               int32_t                  mask_y,
    3217                 :                               int32_t                  dest_x,
    3218                 :                               int32_t                  dest_y,
    3219                 :                               int32_t                  width,
    3220                 :                               int32_t                  height)
    3221                 : {
    3222                 :     uint32_t src, srca;
    3223                 :     uint32_t *dst_line, *dst;
    3224                 :     uint8_t *mask_line, *mask;
    3225                 :     int dst_stride, mask_stride;
    3226                 :     int32_t w;
    3227                 :     uint32_t m, d;
    3228                 : 
    3229                 :     __m128i xmm_src, xmm_alpha, xmm_def;
    3230                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    3231                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3232                 : 
    3233                 :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    3234                 : 
    3235               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    3236                 : 
    3237               0 :     srca = src >> 24;
    3238               0 :     if (src == 0)
    3239               0 :         return;
    3240                 : 
    3241               0 :     PIXMAN_IMAGE_GET_LINE (
    3242                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3243               0 :     PIXMAN_IMAGE_GET_LINE (
    3244                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3245                 : 
    3246               0 :     xmm_def = create_mask_2x32_128 (src, src);
    3247               0 :     xmm_src = expand_pixel_32_1x128 (src);
    3248               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    3249               0 :     mmx_src   = xmm_src;
    3250               0 :     mmx_alpha = xmm_alpha;
    3251                 : 
    3252               0 :     while (height--)
    3253                 :     {
    3254               0 :         dst = dst_line;
    3255               0 :         dst_line += dst_stride;
    3256               0 :         mask = mask_line;
    3257               0 :         mask_line += mask_stride;
    3258               0 :         w = width;
    3259                 : 
    3260               0 :         while (w && (unsigned long)dst & 15)
    3261                 :         {
    3262               0 :             uint8_t m = *mask++;
    3263                 : 
    3264               0 :             if (m)
    3265                 :             {
    3266               0 :                 d = *dst;
    3267               0 :                 mmx_mask = expand_pixel_8_1x128 (m);
    3268               0 :                 mmx_dest = unpack_32_1x128 (d);
    3269                 : 
    3270               0 :                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
    3271                 :                                                    &mmx_alpha,
    3272                 :                                                    &mmx_mask,
    3273                 :                                                    &mmx_dest));
    3274                 :             }
    3275                 : 
    3276               0 :             w--;
    3277               0 :             dst++;
    3278                 :         }
    3279                 : 
    3280               0 :         while (w >= 4)
    3281                 :         {
    3282               0 :             m = *((uint32_t*)mask);
    3283                 : 
    3284               0 :             if (srca == 0xff && m == 0xffffffff)
    3285                 :             {
    3286               0 :                 save_128_aligned ((__m128i*)dst, xmm_def);
    3287                 :             }
    3288               0 :             else if (m)
    3289                 :             {
    3290               0 :                 xmm_dst = load_128_aligned ((__m128i*) dst);
    3291               0 :                 xmm_mask = unpack_32_1x128 (m);
    3292               0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3293                 : 
    3294                 :                 /* Unpacking */
    3295               0 :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    3296               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3297                 : 
    3298               0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3299                 :                                         &xmm_mask_lo, &xmm_mask_hi);
    3300                 : 
    3301                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    3302                 :                                &xmm_alpha, &xmm_alpha,
    3303                 :                                &xmm_mask_lo, &xmm_mask_hi,
    3304                 :                                &xmm_dst_lo, &xmm_dst_hi);
    3305                 : 
    3306               0 :                 save_128_aligned (
    3307                 :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3308                 :             }
    3309                 : 
    3310               0 :             w -= 4;
    3311               0 :             dst += 4;
    3312               0 :             mask += 4;
    3313                 :         }
    3314                 : 
    3315               0 :         while (w)
    3316                 :         {
    3317               0 :             uint8_t m = *mask++;
    3318                 : 
    3319               0 :             if (m)
    3320                 :             {
    3321               0 :                 d = *dst;
    3322               0 :                 mmx_mask = expand_pixel_8_1x128 (m);
    3323               0 :                 mmx_dest = unpack_32_1x128 (d);
    3324                 : 
    3325               0 :                 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
    3326                 :                                                    &mmx_alpha,
    3327                 :                                                    &mmx_mask,
    3328                 :                                                    &mmx_dest));
    3329                 :             }
    3330                 : 
    3331               0 :             w--;
    3332               0 :             dst++;
    3333                 :         }
    3334                 :     }
    3335                 : 
    3336                 : }
    3337                 : 
    3338                 : static pixman_bool_t
    3339              17 : pixman_fill_sse2 (uint32_t *bits,
    3340                 :                   int       stride,
    3341                 :                   int       bpp,
    3342                 :                   int       x,
    3343                 :                   int       y,
    3344                 :                   int       width,
    3345                 :                   int       height,
    3346                 :                   uint32_t  data)
    3347                 : {
    3348                 :     uint32_t byte_width;
    3349                 :     uint8_t         *byte_line;
    3350                 : 
    3351                 :     __m128i xmm_def;
    3352                 : 
    3353              17 :     if (bpp == 8)
    3354                 :     {
    3355                 :         uint8_t b;
    3356                 :         uint16_t w;
    3357                 : 
    3358               0 :         stride = stride * (int) sizeof (uint32_t) / 1;
    3359               0 :         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
    3360               0 :         byte_width = width;
    3361               0 :         stride *= 1;
    3362                 : 
    3363               0 :         b = data & 0xff;
    3364               0 :         w = (b << 8) | b;
    3365               0 :         data = (w << 16) | w;
    3366                 :     }
    3367              17 :     else if (bpp == 16)
    3368                 :     {
    3369               0 :         stride = stride * (int) sizeof (uint32_t) / 2;
    3370               0 :         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
    3371               0 :         byte_width = 2 * width;
    3372               0 :         stride *= 2;
    3373                 : 
    3374               0 :         data = (data & 0xffff) * 0x00010001;
    3375                 :     }
    3376              17 :     else if (bpp == 32)
    3377                 :     {
    3378              17 :         stride = stride * (int) sizeof (uint32_t) / 4;
    3379              17 :         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
    3380              17 :         byte_width = 4 * width;
    3381              17 :         stride *= 4;
    3382                 :     }
    3383                 :     else
    3384                 :     {
    3385               0 :         return FALSE;
    3386                 :     }
    3387                 : 
    3388              17 :     xmm_def = create_mask_2x32_128 (data, data);
    3389                 : 
    3390             559 :     while (height--)
    3391                 :     {
    3392                 :         int w;
    3393             525 :         uint8_t *d = byte_line;
    3394             525 :         byte_line += stride;
    3395             525 :         w = byte_width;
    3396                 : 
    3397            1050 :         while (w >= 1 && ((unsigned long)d & 1))
    3398                 :         {
    3399               0 :             *(uint8_t *)d = data;
    3400               0 :             w -= 1;
    3401               0 :             d += 1;
    3402                 :         }
    3403                 : 
    3404            1050 :         while (w >= 2 && ((unsigned long)d & 3))
    3405                 :         {
    3406               0 :             *(uint16_t *)d = data;
    3407               0 :             w -= 2;
    3408               0 :             d += 2;
    3409                 :         }
    3410                 : 
    3411            1293 :         while (w >= 4 && ((unsigned long)d & 15))
    3412                 :         {
    3413             243 :             *(uint32_t *)d = data;
    3414                 : 
    3415             243 :             w -= 4;
    3416             243 :             d += 4;
    3417                 :         }
    3418                 : 
    3419            1545 :         while (w >= 128)
    3420                 :         {
    3421             495 :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3422             495 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3423             495 :             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
    3424             495 :             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
    3425             495 :             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
    3426             495 :             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
    3427             495 :             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
    3428             495 :             save_128_aligned ((__m128i*)(d + 112), xmm_def);
    3429                 : 
    3430             495 :             d += 128;
    3431             495 :             w -= 128;
    3432                 :         }
    3433                 : 
    3434             525 :         if (w >= 64)
    3435                 :         {
    3436              64 :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3437              64 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3438              64 :             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
    3439              64 :             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
    3440                 : 
    3441              64 :             d += 64;
    3442              64 :             w -= 64;
    3443                 :         }
    3444                 : 
    3445             525 :         if (w >= 32)
    3446                 :         {
    3447               0 :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3448               0 :             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
    3449                 : 
    3450               0 :             d += 32;
    3451               0 :             w -= 32;
    3452                 :         }
    3453                 : 
    3454             525 :         if (w >= 16)
    3455                 :         {
    3456               4 :             save_128_aligned ((__m128i*)(d),     xmm_def);
    3457                 : 
    3458               4 :             d += 16;
    3459               4 :             w -= 16;
    3460                 :         }
    3461                 : 
    3462            1299 :         while (w >= 4)
    3463                 :         {
    3464             249 :             *(uint32_t *)d = data;
    3465                 : 
    3466             249 :             w -= 4;
    3467             249 :             d += 4;
    3468                 :         }
    3469                 : 
    3470             525 :         if (w >= 2)
    3471                 :         {
    3472               0 :             *(uint16_t *)d = data;
    3473               0 :             w -= 2;
    3474               0 :             d += 2;
    3475                 :         }
    3476                 : 
    3477             525 :         if (w >= 1)
    3478                 :         {
    3479               0 :             *(uint8_t *)d = data;
    3480               0 :             w -= 1;
    3481               0 :             d += 1;
    3482                 :         }
    3483                 :     }
    3484                 : 
    3485              17 :     return TRUE;
    3486                 : }
    3487                 : 
    3488                 : static void
    3489               0 : sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
    3490                 :                              pixman_op_t              op,
    3491                 :                              pixman_image_t *         src_image,
    3492                 :                              pixman_image_t *         mask_image,
    3493                 :                              pixman_image_t *         dst_image,
    3494                 :                              int32_t                  src_x,
    3495                 :                              int32_t                  src_y,
    3496                 :                              int32_t                  mask_x,
    3497                 :                              int32_t                  mask_y,
    3498                 :                              int32_t                  dest_x,
    3499                 :                              int32_t                  dest_y,
    3500                 :                              int32_t                  width,
    3501                 :                              int32_t                  height)
    3502                 : {
    3503                 :     uint32_t src, srca;
    3504                 :     uint32_t    *dst_line, *dst;
    3505                 :     uint8_t     *mask_line, *mask;
    3506                 :     int dst_stride, mask_stride;
    3507                 :     int32_t w;
    3508                 :     uint32_t m;
    3509                 : 
    3510                 :     __m128i xmm_src, xmm_def;
    3511                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3512                 : 
    3513               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    3514                 : 
    3515               0 :     srca = src >> 24;
    3516               0 :     if (src == 0)
    3517                 :     {
    3518               0 :         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
    3519               0 :                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
    3520                 :                           dest_x, dest_y, width, height, 0);
    3521               0 :         return;
    3522                 :     }
    3523                 : 
    3524               0 :     PIXMAN_IMAGE_GET_LINE (
    3525                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3526               0 :     PIXMAN_IMAGE_GET_LINE (
    3527                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3528                 : 
    3529               0 :     xmm_def = create_mask_2x32_128 (src, src);
    3530               0 :     xmm_src = expand_pixel_32_1x128 (src);
    3531                 : 
    3532               0 :     while (height--)
    3533                 :     {
    3534               0 :         dst = dst_line;
    3535               0 :         dst_line += dst_stride;
    3536               0 :         mask = mask_line;
    3537               0 :         mask_line += mask_stride;
    3538               0 :         w = width;
    3539                 : 
    3540               0 :         while (w && (unsigned long)dst & 15)
    3541                 :         {
    3542               0 :             uint8_t m = *mask++;
    3543                 : 
    3544               0 :             if (m)
    3545                 :             {
    3546               0 :                 *dst = pack_1x128_32 (
    3547                 :                     pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
    3548                 :             }
    3549                 :             else
    3550                 :             {
    3551               0 :                 *dst = 0;
    3552                 :             }
    3553                 : 
    3554               0 :             w--;
    3555               0 :             dst++;
    3556                 :         }
    3557                 : 
    3558               0 :         while (w >= 4)
    3559                 :         {
    3560               0 :             m = *((uint32_t*)mask);
    3561                 : 
    3562               0 :             if (srca == 0xff && m == 0xffffffff)
    3563                 :             {
    3564               0 :                 save_128_aligned ((__m128i*)dst, xmm_def);
    3565                 :             }
    3566               0 :             else if (m)
    3567                 :             {
    3568               0 :                 xmm_mask = unpack_32_1x128 (m);
    3569               0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3570                 : 
    3571                 :                 /* Unpacking */
    3572               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3573                 : 
    3574               0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3575                 :                                         &xmm_mask_lo, &xmm_mask_hi);
    3576                 : 
    3577                 :                 pix_multiply_2x128 (&xmm_src, &xmm_src,
    3578                 :                                     &xmm_mask_lo, &xmm_mask_hi,
    3579                 :                                     &xmm_mask_lo, &xmm_mask_hi);
    3580                 : 
    3581               0 :                 save_128_aligned (
    3582                 :                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
    3583                 :             }
    3584                 :             else
    3585                 :             {
    3586               0 :                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
    3587                 :             }
    3588                 : 
    3589               0 :             w -= 4;
    3590               0 :             dst += 4;
    3591               0 :             mask += 4;
    3592                 :         }
    3593                 : 
    3594               0 :         while (w)
    3595                 :         {
    3596               0 :             uint8_t m = *mask++;
    3597                 : 
    3598               0 :             if (m)
    3599                 :             {
    3600               0 :                 *dst = pack_1x128_32 (
    3601                 :                     pix_multiply_1x128 (
    3602                 :                         xmm_src, expand_pixel_8_1x128 (m)));
    3603                 :             }
    3604                 :             else
    3605                 :             {
    3606               0 :                 *dst = 0;
    3607                 :             }
    3608                 : 
    3609               0 :             w--;
    3610               0 :             dst++;
    3611                 :         }
    3612                 :     }
    3613                 : 
    3614                 : }
    3615                 : 
    3616                 : static void
    3617               0 : sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
    3618                 :                               pixman_op_t              op,
    3619                 :                               pixman_image_t *         src_image,
    3620                 :                               pixman_image_t *         mask_image,
    3621                 :                               pixman_image_t *         dst_image,
    3622                 :                               int32_t                  src_x,
    3623                 :                               int32_t                  src_y,
    3624                 :                               int32_t                  mask_x,
    3625                 :                               int32_t                  mask_y,
    3626                 :                               int32_t                  dest_x,
    3627                 :                               int32_t                  dest_y,
    3628                 :                               int32_t                  width,
    3629                 :                               int32_t                  height)
    3630                 : {
    3631                 :     uint32_t src, srca;
    3632                 :     uint16_t    *dst_line, *dst, d;
    3633                 :     uint8_t     *mask_line, *mask;
    3634                 :     int dst_stride, mask_stride;
    3635                 :     int32_t w;
    3636                 :     uint32_t m;
    3637                 :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    3638                 : 
    3639                 :     __m128i xmm_src, xmm_alpha;
    3640                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    3641                 :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3642                 : 
    3643               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    3644                 : 
    3645               0 :     srca = src >> 24;
    3646               0 :     if (src == 0)
    3647               0 :         return;
    3648                 : 
    3649               0 :     PIXMAN_IMAGE_GET_LINE (
    3650                 :         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3651               0 :     PIXMAN_IMAGE_GET_LINE (
    3652                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    3653                 : 
    3654               0 :     xmm_src = expand_pixel_32_1x128 (src);
    3655               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    3656               0 :     mmx_src = xmm_src;
    3657               0 :     mmx_alpha = xmm_alpha;
    3658                 : 
    3659               0 :     while (height--)
    3660                 :     {
    3661               0 :         dst = dst_line;
    3662               0 :         dst_line += dst_stride;
    3663               0 :         mask = mask_line;
    3664               0 :         mask_line += mask_stride;
    3665               0 :         w = width;
    3666                 : 
    3667               0 :         while (w && (unsigned long)dst & 15)
    3668                 :         {
    3669               0 :             m = *mask++;
    3670                 : 
    3671               0 :             if (m)
    3672                 :             {
    3673               0 :                 d = *dst;
    3674               0 :                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    3675               0 :                 mmx_dest = expand565_16_1x128 (d);
    3676                 : 
    3677               0 :                 *dst = pack_565_32_16 (
    3678                 :                     pack_1x128_32 (
    3679                 :                         in_over_1x128 (
    3680                 :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    3681                 :             }
    3682                 : 
    3683               0 :             w--;
    3684               0 :             dst++;
    3685                 :         }
    3686                 : 
    3687               0 :         while (w >= 8)
    3688                 :         {
    3689               0 :             xmm_dst = load_128_aligned ((__m128i*) dst);
    3690               0 :             unpack_565_128_4x128 (xmm_dst,
    3691                 :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3692                 : 
    3693               0 :             m = *((uint32_t*)mask);
    3694               0 :             mask += 4;
    3695                 : 
    3696               0 :             if (m)
    3697                 :             {
    3698               0 :                 xmm_mask = unpack_32_1x128 (m);
    3699               0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3700                 : 
    3701                 :                 /* Unpacking */
    3702               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3703                 : 
    3704               0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3705                 :                                         &xmm_mask_lo, &xmm_mask_hi);
    3706                 : 
    3707                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    3708                 :                                &xmm_alpha, &xmm_alpha,
    3709                 :                                &xmm_mask_lo, &xmm_mask_hi,
    3710                 :                                &xmm_dst0, &xmm_dst1);
    3711                 :             }
    3712                 : 
    3713               0 :             m = *((uint32_t*)mask);
    3714               0 :             mask += 4;
    3715                 : 
    3716               0 :             if (m)
    3717                 :             {
    3718               0 :                 xmm_mask = unpack_32_1x128 (m);
    3719               0 :                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
    3720                 : 
    3721                 :                 /* Unpacking */
    3722               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    3723                 : 
    3724               0 :                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
    3725                 :                                         &xmm_mask_lo, &xmm_mask_hi);
    3726                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    3727                 :                                &xmm_alpha, &xmm_alpha,
    3728                 :                                &xmm_mask_lo, &xmm_mask_hi,
    3729                 :                                &xmm_dst2, &xmm_dst3);
    3730                 :             }
    3731                 : 
    3732               0 :             save_128_aligned (
    3733                 :                 (__m128i*)dst, pack_565_4x128_128 (
    3734                 :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3735                 : 
    3736               0 :             w -= 8;
    3737               0 :             dst += 8;
    3738                 :         }
    3739                 : 
    3740               0 :         while (w)
    3741                 :         {
    3742               0 :             m = *mask++;
    3743                 : 
    3744               0 :             if (m)
    3745                 :             {
    3746               0 :                 d = *dst;
    3747               0 :                 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    3748               0 :                 mmx_dest = expand565_16_1x128 (d);
    3749                 : 
    3750               0 :                 *dst = pack_565_32_16 (
    3751                 :                     pack_1x128_32 (
    3752                 :                         in_over_1x128 (
    3753                 :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    3754                 :             }
    3755                 : 
    3756               0 :             w--;
    3757               0 :             dst++;
    3758                 :         }
    3759                 :     }
    3760                 : 
    3761                 : }
    3762                 : 
    3763                 : static void
    3764               0 : sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
    3765                 :                                  pixman_op_t              op,
    3766                 :                                  pixman_image_t *         src_image,
    3767                 :                                  pixman_image_t *         mask_image,
    3768                 :                                  pixman_image_t *         dst_image,
    3769                 :                                  int32_t                  src_x,
    3770                 :                                  int32_t                  src_y,
    3771                 :                                  int32_t                  mask_x,
    3772                 :                                  int32_t                  mask_y,
    3773                 :                                  int32_t                  dest_x,
    3774                 :                                  int32_t                  dest_y,
    3775                 :                                  int32_t                  width,
    3776                 :                                  int32_t                  height)
    3777                 : {
    3778                 :     uint16_t    *dst_line, *dst, d;
    3779                 :     uint32_t    *src_line, *src, s;
    3780                 :     int dst_stride, src_stride;
    3781                 :     int32_t w;
    3782                 :     uint32_t opaque, zero;
    3783                 : 
    3784                 :     __m128i ms;
    3785                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    3786                 :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    3787                 : 
    3788               0 :     PIXMAN_IMAGE_GET_LINE (
    3789                 :         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    3790               0 :     PIXMAN_IMAGE_GET_LINE (
    3791                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3792                 : 
    3793               0 :     while (height--)
    3794                 :     {
    3795               0 :         dst = dst_line;
    3796               0 :         dst_line += dst_stride;
    3797               0 :         src = src_line;
    3798               0 :         src_line += src_stride;
    3799               0 :         w = width;
    3800                 : 
    3801               0 :         while (w && (unsigned long)dst & 15)
    3802                 :         {
    3803               0 :             s = *src++;
    3804               0 :             d = *dst;
    3805                 : 
    3806               0 :             ms = unpack_32_1x128 (s);
    3807                 : 
    3808               0 :             *dst++ = pack_565_32_16 (
    3809                 :                 pack_1x128_32 (
    3810                 :                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
    3811               0 :             w--;
    3812                 :         }
    3813                 : 
    3814               0 :         while (w >= 8)
    3815                 :         {
    3816                 :             /* First round */
    3817               0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    3818               0 :             xmm_dst = load_128_aligned  ((__m128i*)dst);
    3819                 : 
    3820               0 :             opaque = is_opaque (xmm_src);
    3821               0 :             zero = is_zero (xmm_src);
    3822                 : 
    3823               0 :             unpack_565_128_4x128 (xmm_dst,
    3824                 :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    3825               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3826                 : 
    3827                 :             /* preload next round*/
    3828               0 :             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
    3829                 : 
    3830               0 :             if (opaque)
    3831                 :             {
    3832               0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3833                 :                                      &xmm_dst0, &xmm_dst1);
    3834                 :             }
    3835               0 :             else if (!zero)
    3836                 :             {
    3837               0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3838                 :                                         &xmm_dst0, &xmm_dst1);
    3839                 :             }
    3840                 : 
    3841                 :             /* Second round */
    3842               0 :             opaque = is_opaque (xmm_src);
    3843               0 :             zero = is_zero (xmm_src);
    3844                 : 
    3845               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    3846                 : 
    3847               0 :             if (opaque)
    3848                 :             {
    3849               0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3850                 :                                      &xmm_dst2, &xmm_dst3);
    3851                 :             }
    3852               0 :             else if (!zero)
    3853                 :             {
    3854               0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3855                 :                                         &xmm_dst2, &xmm_dst3);
    3856                 :             }
    3857                 : 
    3858               0 :             save_128_aligned (
    3859                 :                 (__m128i*)dst, pack_565_4x128_128 (
    3860                 :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    3861                 : 
    3862               0 :             w -= 8;
    3863               0 :             src += 8;
    3864               0 :             dst += 8;
    3865                 :         }
    3866                 : 
    3867               0 :         while (w)
    3868                 :         {
    3869               0 :             s = *src++;
    3870               0 :             d = *dst;
    3871                 : 
    3872               0 :             ms = unpack_32_1x128 (s);
    3873                 : 
    3874               0 :             *dst++ = pack_565_32_16 (
    3875                 :                 pack_1x128_32 (
    3876                 :                     over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
    3877               0 :             w--;
    3878                 :         }
    3879                 :     }
    3880                 : 
    3881               0 : }
    3882                 : 
    3883                 : static void
    3884               0 : sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
    3885                 :                                  pixman_op_t              op,
    3886                 :                                  pixman_image_t *         src_image,
    3887                 :                                  pixman_image_t *         mask_image,
    3888                 :                                  pixman_image_t *         dst_image,
    3889                 :                                  int32_t                  src_x,
    3890                 :                                  int32_t                  src_y,
    3891                 :                                  int32_t                  mask_x,
    3892                 :                                  int32_t                  mask_y,
    3893                 :                                  int32_t                  dest_x,
    3894                 :                                  int32_t                  dest_y,
    3895                 :                                  int32_t                  width,
    3896                 :                                  int32_t                  height)
    3897                 : {
    3898                 :     uint32_t    *dst_line, *dst, d;
    3899                 :     uint32_t    *src_line, *src, s;
    3900                 :     int dst_stride, src_stride;
    3901                 :     int32_t w;
    3902                 :     uint32_t opaque, zero;
    3903                 : 
    3904                 :     __m128i xmm_src_lo, xmm_src_hi;
    3905                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    3906                 : 
    3907               0 :     PIXMAN_IMAGE_GET_LINE (
    3908                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    3909               0 :     PIXMAN_IMAGE_GET_LINE (
    3910                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    3911                 : 
    3912               0 :     while (height--)
    3913                 :     {
    3914               0 :         dst = dst_line;
    3915               0 :         dst_line += dst_stride;
    3916               0 :         src = src_line;
    3917               0 :         src_line += src_stride;
    3918               0 :         w = width;
    3919                 : 
    3920               0 :         while (w && (unsigned long)dst & 15)
    3921                 :         {
    3922               0 :             s = *src++;
    3923               0 :             d = *dst;
    3924                 : 
    3925               0 :             *dst++ = pack_1x128_32 (
    3926                 :                 over_rev_non_pre_1x128 (
    3927                 :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    3928                 : 
    3929               0 :             w--;
    3930                 :         }
    3931                 : 
    3932               0 :         while (w >= 4)
    3933                 :         {
    3934               0 :             xmm_src_hi = load_128_unaligned ((__m128i*)src);
    3935                 : 
    3936               0 :             opaque = is_opaque (xmm_src_hi);
    3937               0 :             zero = is_zero (xmm_src_hi);
    3938                 : 
    3939               0 :             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    3940                 : 
    3941               0 :             if (opaque)
    3942                 :             {
    3943               0 :                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
    3944                 :                                      &xmm_dst_lo, &xmm_dst_hi);
    3945                 : 
    3946               0 :                 save_128_aligned (
    3947                 :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3948                 :             }
    3949               0 :             else if (!zero)
    3950                 :             {
    3951               0 :                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
    3952                 : 
    3953               0 :                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    3954                 : 
    3955               0 :                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
    3956                 :                                         &xmm_dst_lo, &xmm_dst_hi);
    3957                 : 
    3958               0 :                 save_128_aligned (
    3959                 :                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    3960                 :             }
    3961                 : 
    3962               0 :             w -= 4;
    3963               0 :             dst += 4;
    3964               0 :             src += 4;
    3965                 :         }
    3966                 : 
    3967               0 :         while (w)
    3968                 :         {
    3969               0 :             s = *src++;
    3970               0 :             d = *dst;
    3971                 : 
    3972               0 :             *dst++ = pack_1x128_32 (
    3973                 :                 over_rev_non_pre_1x128 (
    3974                 :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    3975                 : 
    3976               0 :             w--;
    3977                 :         }
    3978                 :     }
    3979                 : 
    3980               0 : }
    3981                 : 
    3982                 : static void
    3983               0 : sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
    3984                 :                                     pixman_op_t              op,
    3985                 :                                     pixman_image_t *         src_image,
    3986                 :                                     pixman_image_t *         mask_image,
    3987                 :                                     pixman_image_t *         dst_image,
    3988                 :                                     int32_t                  src_x,
    3989                 :                                     int32_t                  src_y,
    3990                 :                                     int32_t                  mask_x,
    3991                 :                                     int32_t                  mask_y,
    3992                 :                                     int32_t                  dest_x,
    3993                 :                                     int32_t                  dest_y,
    3994                 :                                     int32_t                  width,
    3995                 :                                     int32_t                  height)
    3996                 : {
                               /*
                                * Fast path combining a solid source with a 32-bit mask onto a
                                * 16-bit destination (rows are fetched as uint32_t and uint16_t
                                * below).  The "_ca" suffix and the in_over_* helpers suggest a
                                * component-alpha "(src IN mask) OVER dest"; those helpers are
                                * defined outside this view, so treat that reading as an
                                * assumption to confirm.
                                *
                                * Each row is processed in three phases: a scalar loop until dst
                                * is 16-byte aligned, a loop handling 8 pixels per iteration, and
                                * a scalar loop for whatever is left.
                                *
                                * op, src_x and src_y are not read by this function.
                                */
    3997                 :     uint32_t src;
    3998                 :     uint16_t    *dst_line, *dst, d;
    3999                 :     uint32_t    *mask_line, *mask, m;
    4000                 :     int dst_stride, mask_stride;
    4001                 :     int w;
    4002                 :     uint32_t pack_cmp;
    4003                 : 
    4004                 :     __m128i xmm_src, xmm_alpha;
    4005                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4006                 :     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
    4007                 : 
    4008                 :     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
    4009                 : 
    4010               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    4011                 : 
                               /* An all-zero solid source: return without touching dst. */
    4012               0 :     if (src == 0)
    4013               0 :         return;
    4014                 : 
    4015               0 :     PIXMAN_IMAGE_GET_LINE (
    4016                 :         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    4017               0 :     PIXMAN_IMAGE_GET_LINE (
    4018                 :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    4019                 : 
                               /*
                                * The source and its alpha are expanded once, outside the loops.
                                * The mmx_* variables used by the scalar loops are plain copies
                                * of the xmm_* values.
                                */
    4020               0 :     xmm_src = expand_pixel_32_1x128 (src);
    4021               0 :     xmm_alpha = expand_alpha_1x128 (xmm_src);
    4022               0 :     mmx_src = xmm_src;
    4023               0 :     mmx_alpha = xmm_alpha;
    4024                 : 
    4025               0 :     while (height--)
    4026                 :     {
    4027               0 :         w = width;
    4028               0 :         mask = mask_line;
    4029               0 :         dst = dst_line;
    4030               0 :         mask_line += mask_stride;
    4031               0 :         dst_line += dst_stride;
    4032                 : 
                                   /*
                                    * Head: one pixel at a time until dst sits on a 16-byte
                                    * boundary, as required by the aligned load/store below.
                                    * NOTE(review): the pointer is cast to unsigned long, which is
                                    * narrower than a pointer on LLP64 targets; only the low four
                                    * bits are tested so the result is unaffected, but uintptr_t
                                    * would state the intent portably.
                                    */
    4033               0 :         while (w && ((unsigned long)dst & 15))
    4034                 :         {
                                       /* NOTE(review): mask is already uint32_t *, the cast is redundant. */
    4035               0 :             m = *(uint32_t *) mask;
    4036                 : 
                                       /* A zero mask pixel leaves the destination pixel untouched. */
    4037               0 :             if (m)
    4038                 :             {
    4039               0 :                 d = *dst;
    4040               0 :                 mmx_mask = unpack_32_1x128 (m);
    4041               0 :                 mmx_dest = expand565_16_1x128 (d);
    4042                 : 
    4043               0 :                 *dst = pack_565_32_16 (
    4044                 :                     pack_1x128_32 (
    4045                 :                         in_over_1x128 (
    4046                 :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    4047                 :             }
    4048                 : 
    4049               0 :             w--;
    4050               0 :             dst++;
    4051               0 :             mask++;
    4052                 :         }
    4053                 : 
                                   /*
                                    * Body: 8 destination pixels (one aligned 128-bit load) per
                                    * iteration.  The matching 8 mask pixels take two unaligned
                                    * 128-bit loads, handled as a "first" and a "second" round of
                                    * four pixels each.
                                    */
    4054               0 :         while (w >= 8)
    4055                 :         {
    4056                 :             /* First round */
    4057               0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    4058               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4059                 : 
                                       /*
                                        * Compare each 32-bit mask lane with zero and collect the
                                        * byte sign bits: 0xffff means all four mask pixels are zero.
                                        */
    4060               0 :             pack_cmp = _mm_movemask_epi8 (
    4061                 :                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    4062                 : 
    4063               0 :             unpack_565_128_4x128 (xmm_dst,
    4064                 :                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
    4065               0 :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4066                 : 
    4067                 :             /* preload next round */
    4068               0 :             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
    4069                 : 
    4070                 :             /* first round: blend unless all four mask pixels are zero */
    4071               0 :             if (pack_cmp != 0xffff)
    4072                 :             {
    4073                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    4074                 :                                &xmm_alpha, &xmm_alpha,
    4075                 :                                &xmm_mask_lo, &xmm_mask_hi,
    4076                 :                                &xmm_dst0, &xmm_dst1);
    4077                 :             }
    4078                 : 
    4079                 :             /* Second round */
    4080               0 :             pack_cmp = _mm_movemask_epi8 (
    4081                 :                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
    4082                 : 
    4083               0 :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4084                 : 
    4085               0 :             if (pack_cmp != 0xffff)
    4086                 :             {
    4087                 :                 in_over_2x128 (&xmm_src, &xmm_src,
    4088                 :                                &xmm_alpha, &xmm_alpha,
    4089                 :                                &xmm_mask_lo, &xmm_mask_hi,
    4090                 :                                &xmm_dst2, &xmm_dst3);
    4091                 :             }
    4092                 : 
                                       /*
                                        * All four unpacked destination vectors are repacked and
                                        * stored, including halves whose blend was skipped.
                                        */
    4093               0 :             save_128_aligned (
    4094                 :                 (__m128i*)dst, pack_565_4x128_128 (
    4095                 :                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
    4096                 : 
    4097               0 :             w -= 8;
    4098               0 :             dst += 8;
    4099               0 :             mask += 8;
    4100                 :         }
    4101                 : 
                                   /* Tail: the remaining pixels, same per-pixel code as the head. */
    4102               0 :         while (w)
    4103                 :         {
    4104               0 :             m = *(uint32_t *) mask;
    4105                 : 
    4106               0 :             if (m)
    4107                 :             {
    4108               0 :                 d = *dst;
    4109               0 :                 mmx_mask = unpack_32_1x128 (m);
    4110               0 :                 mmx_dest = expand565_16_1x128 (d);
    4111                 : 
    4112               0 :                 *dst = pack_565_32_16 (
    4113                 :                     pack_1x128_32 (
    4114                 :                         in_over_1x128 (
    4115                 :                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
    4116                 :             }
    4117                 : 
    4118               0 :             w--;
    4119               0 :             dst++;
    4120               0 :             mask++;
    4121                 :         }
    4122                 :     }
    4123                 : 
    4124                 : }
    4125                 : 
    4126                 : static void
    4127               0 : sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
    4128                 :                          pixman_op_t              op,
    4129                 :                          pixman_image_t *         src_image,
    4130                 :                          pixman_image_t *         mask_image,
    4131                 :                          pixman_image_t *         dst_image,
    4132                 :                          int32_t                  src_x,
    4133                 :                          int32_t                  src_y,
    4134                 :                          int32_t                  mask_x,
    4135                 :                          int32_t                  mask_y,
    4136                 :                          int32_t                  dest_x,
    4137                 :                          int32_t                  dest_y,
    4138                 :                          int32_t                  width,
    4139                 :                          int32_t                  height)
    4140                 : {
                               /*
                                * Fast path for a solid source, an 8-bit mask and an 8-bit
                                * destination (both rows are fetched as uint8_t below).  Every
                                * destination byte d is replaced by
                                *     pix_multiply (pix_multiply (alpha, m), d)
                                * where alpha comes from the solid source and m is the mask byte.
                                * pix_multiply_* is defined outside this view; presumably it is a
                                * normalized 8-bit multiply -- confirm against its definition.
                                *
                                * Rows are processed as: scalar head until dst is 16-byte
                                * aligned, 16 pixels per iteration, then a scalar tail.
                                *
                                * op, src_x and src_y are not read by this function.
                                */
    4141                 :     uint8_t     *dst_line, *dst;
    4142                 :     uint8_t     *mask_line, *mask;
    4143                 :     int dst_stride, mask_stride;
    4144                 :     uint32_t d, m;
    4145                 :     uint32_t src;
    4146                 :     uint8_t sa;
    4147                 :     int32_t w;
    4148                 : 
    4149                 :     __m128i xmm_alpha;
    4150                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4151                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4152                 : 
    4153               0 :     PIXMAN_IMAGE_GET_LINE (
    4154                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4155               0 :     PIXMAN_IMAGE_GET_LINE (
    4156                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4157                 : 
    4158               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    4159                 : 
                               /* NOTE(review): sa is assigned here but never read afterwards. */
    4160               0 :     sa = src >> 24;
    4161                 : 
                               /* The source alpha is expanded once, outside the loops. */
    4162               0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4163                 : 
    4164               0 :     while (height--)
    4165                 :     {
    4166               0 :         dst = dst_line;
    4167               0 :         dst_line += dst_stride;
    4168               0 :         mask = mask_line;
    4169               0 :         mask_line += mask_stride;
    4170               0 :         w = width;
    4171                 : 
                                   /*
                                    * Head: single pixels until dst is 16-byte aligned, which the
                                    * aligned load/store in the next loop depends on.
                                    * NOTE(review): unsigned long is narrower than a pointer on
                                    * LLP64 targets; harmless for a low-bits test, but uintptr_t
                                    * would be the portable spelling.
                                    */
    4172               0 :         while (w && ((unsigned long)dst & 15))
    4173                 :         {
    4174               0 :             m = (uint32_t) *mask++;
    4175               0 :             d = (uint32_t) *dst;
    4176                 : 
    4177               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4178                 :                 pix_multiply_1x128 (
    4179                 :                     pix_multiply_1x128 (xmm_alpha,
    4180                 :                                        unpack_32_1x128 (m)),
    4181                 :                     unpack_32_1x128 (d)));
    4182               0 :             w--;
    4183                 :         }
    4184                 : 
                                   /* Body: 16 mask bytes and 16 destination bytes per iteration. */
    4185               0 :         while (w >= 16)
    4186                 :         {
    4187               0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    4188               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4189                 : 
    4190               0 :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4191               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4192                 : 
                                       /* mask = alpha * mask (result written back over the mask halves) */
    4193                 :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4194                 :                                 &xmm_mask_lo, &xmm_mask_hi,
    4195                 :                                 &xmm_mask_lo, &xmm_mask_hi);
    4196                 : 
                                       /* dst = mask * dst (result written back over the dst halves) */
    4197                 :             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
    4198                 :                                 &xmm_dst_lo, &xmm_dst_hi,
    4199                 :                                 &xmm_dst_lo, &xmm_dst_hi);
    4200                 : 
    4201               0 :             save_128_aligned (
    4202                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4203                 : 
    4204               0 :             mask += 16;
    4205               0 :             dst += 16;
    4206               0 :             w -= 16;
    4207                 :         }
    4208                 : 
                                   /* Tail: the remaining pixels, same per-pixel code as the head. */
    4209               0 :         while (w)
    4210                 :         {
    4211               0 :             m = (uint32_t) *mask++;
    4212               0 :             d = (uint32_t) *dst;
    4213                 : 
    4214               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4215                 :                 pix_multiply_1x128 (
    4216                 :                     pix_multiply_1x128 (
    4217                 :                         xmm_alpha, unpack_32_1x128 (m)),
    4218                 :                     unpack_32_1x128 (d)));
    4219               0 :             w--;
    4220                 :         }
    4221                 :     }
    4222                 : 
    4223               0 : }
    4224                 : 
    4225                 : static void
    4226               0 : sse2_composite_in_n_8 (pixman_implementation_t *imp,
    4227                 :                        pixman_op_t              op,
    4228                 :                        pixman_image_t *         src_image,
    4229                 :                        pixman_image_t *         mask_image,
    4230                 :                        pixman_image_t *         dst_image,
    4231                 :                        int32_t                  src_x,
    4232                 :                        int32_t                  src_y,
    4233                 :                        int32_t                  mask_x,
    4234                 :                        int32_t                  mask_y,
    4235                 :                        int32_t                  dest_x,
    4236                 :                        int32_t                  dest_y,
    4237                 :                        int32_t                  width,
    4238                 :                        int32_t                  height)
    4239                 : {
                               /*
                                * Fast path for a solid source and an 8-bit destination with no
                                * mask access (mask_image, mask_x and mask_y are not read).
                                * Every destination byte d is replaced by
                                * pix_multiply (alpha, d), where alpha is taken from the solid
                                * source.  pix_multiply_* is defined outside this view;
                                * presumably a normalized 8-bit multiply -- confirm.
                                *
                                * Two shortcuts are taken on the source's top byte: 0xff returns
                                * immediately, 0x00 fills the rectangle through pixman_fill.
                                *
                                * op, src_x and src_y are not read by this function.
                                */
    4240                 :     uint8_t     *dst_line, *dst;
    4241                 :     int dst_stride;
    4242                 :     uint32_t d;
    4243                 :     uint32_t src;
    4244                 :     int32_t w;
    4245                 : 
    4246                 :     __m128i xmm_alpha;
    4247                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4248                 : 
    4249               0 :     PIXMAN_IMAGE_GET_LINE (
    4250                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4251                 : 
    4252               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    4253                 : 
                               /* Expanded from the full pixel, before src is reduced to its top byte. */
    4254               0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4255                 : 
                               /* From here on src holds only bits 24..31 of the solid pixel. */
    4256               0 :     src = src >> 24;
    4257                 : 
                               /* Top byte 0xff: return without modifying the destination. */
    4258               0 :     if (src == 0xff)
    4259               0 :         return;
    4260                 : 
                               /*
                                * Top byte 0x00: hand the whole rectangle to pixman_fill at 8 bits
                                * per pixel; the fill value passed is src, which is 0 here.
                                */
    4261               0 :     if (src == 0x00)
    4262                 :     {
    4263               0 :         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
    4264                 :                      8, dest_x, dest_y, width, height, src);
    4265                 : 
    4266               0 :         return;
    4267                 :     }
    4268                 : 
    4269               0 :     while (height--)
    4270                 :     {
    4271               0 :         dst = dst_line;
    4272               0 :         dst_line += dst_stride;
    4273               0 :         w = width;
    4274                 : 
                                   /*
                                    * Head: single pixels until dst is 16-byte aligned, which the
                                    * aligned load/store in the next loop depends on.
                                    * NOTE(review): unsigned long is narrower than a pointer on
                                    * LLP64 targets; harmless for a low-bits test, but uintptr_t
                                    * would be the portable spelling.
                                    */
    4275               0 :         while (w && ((unsigned long)dst & 15))
    4276                 :         {
    4277               0 :             d = (uint32_t) *dst;
    4278                 : 
    4279               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4280                 :                 pix_multiply_1x128 (
    4281                 :                     xmm_alpha,
    4282                 :                     unpack_32_1x128 (d)));
    4283               0 :             w--;
    4284                 :         }
    4285                 : 
                                   /* Body: 16 destination bytes per iteration. */
    4286               0 :         while (w >= 16)
    4287                 :         {
    4288               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4289                 : 
    4290               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4291                 :             
                                       /* dst = alpha * dst (result written back over the dst halves) */
    4292                 :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4293                 :                                 &xmm_dst_lo, &xmm_dst_hi,
    4294                 :                                 &xmm_dst_lo, &xmm_dst_hi);
    4295                 : 
    4296               0 :             save_128_aligned (
    4297                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4298                 : 
    4299               0 :             dst += 16;
    4300               0 :             w -= 16;
    4301                 :         }
    4302                 : 
                                   /* Tail: the remaining pixels, same per-pixel code as the head. */
    4303               0 :         while (w)
    4304                 :         {
    4305               0 :             d = (uint32_t) *dst;
    4306                 : 
    4307               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4308                 :                 pix_multiply_1x128 (
    4309                 :                     xmm_alpha,
    4310                 :                     unpack_32_1x128 (d)));
    4311               0 :             w--;
    4312                 :         }
    4313                 :     }
    4314                 : 
    4315                 : }
    4316                 : 
    4317                 : static void
    4318               0 : sse2_composite_in_8_8 (pixman_implementation_t *imp,
    4319                 :                        pixman_op_t              op,
    4320                 :                        pixman_image_t *         src_image,
    4321                 :                        pixman_image_t *         mask_image,
    4322                 :                        pixman_image_t *         dst_image,
    4323                 :                        int32_t                  src_x,
    4324                 :                        int32_t                  src_y,
    4325                 :                        int32_t                  mask_x,
    4326                 :                        int32_t                  mask_y,
    4327                 :                        int32_t                  dest_x,
    4328                 :                        int32_t                  dest_y,
    4329                 :                        int32_t                  width,
    4330                 :                        int32_t                  height)
    4331                 : {
                               /*
                                * Fast path for an 8-bit source and an 8-bit destination (both
                                * rows are fetched as uint8_t below).  Every destination byte d
                                * is replaced by pix_multiply (s, d), where s is the source byte
                                * at the same position.  pix_multiply_* is defined outside this
                                * view; presumably a normalized 8-bit multiply -- confirm.
                                *
                                * Rows are processed as: scalar head until dst is 16-byte
                                * aligned, 16 pixels per iteration, then a scalar tail.
                                *
                                * imp, op, mask_image, mask_x and mask_y are not read here.
                                */
    4332                 :     uint8_t     *dst_line, *dst;
    4333                 :     uint8_t     *src_line, *src;
    4334                 :     int src_stride, dst_stride;
    4335                 :     int32_t w;
    4336                 :     uint32_t s, d;
    4337                 : 
    4338                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    4339                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4340                 : 
    4341               0 :     PIXMAN_IMAGE_GET_LINE (
    4342                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4343               0 :     PIXMAN_IMAGE_GET_LINE (
    4344                 :         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    4345                 : 
    4346               0 :     while (height--)
    4347                 :     {
    4348               0 :         dst = dst_line;
    4349               0 :         dst_line += dst_stride;
    4350               0 :         src = src_line;
    4351               0 :         src_line += src_stride;
    4352               0 :         w = width;
    4353                 : 
                                   /*
                                    * Head: single pixels until dst is 16-byte aligned, which the
                                    * aligned load/store in the next loop depends on.
                                    * NOTE(review): unsigned long is narrower than a pointer on
                                    * LLP64 targets; harmless for a low-bits test, but uintptr_t
                                    * would be the portable spelling.
                                    */
    4354               0 :         while (w && ((unsigned long)dst & 15))
    4355                 :         {
    4356               0 :             s = (uint32_t) *src++;
    4357               0 :             d = (uint32_t) *dst;
    4358                 : 
    4359               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4360                 :                 pix_multiply_1x128 (
    4361                 :                     unpack_32_1x128 (s), unpack_32_1x128 (d)));
    4362               0 :             w--;
    4363                 :         }
    4364                 : 
                                   /*
                                    * Body: 16 pixels per iteration.  Only dst was aligned above,
                                    * so the source is read with the unaligned load.
                                    */
    4365               0 :         while (w >= 16)
    4366                 :         {
    4367               0 :             xmm_src = load_128_unaligned ((__m128i*)src);
    4368               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4369                 : 
    4370               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    4371               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4372                 : 
                                       /* dst = src * dst (result written back over the dst halves) */
    4373                 :             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    4374                 :                                 &xmm_dst_lo, &xmm_dst_hi,
    4375                 :                                 &xmm_dst_lo, &xmm_dst_hi);
    4376                 : 
    4377               0 :             save_128_aligned (
    4378                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4379                 : 
    4380               0 :             src += 16;
    4381               0 :             dst += 16;
    4382               0 :             w -= 16;
    4383                 :         }
    4384                 : 
                                   /* Tail: the remaining pixels, same per-pixel code as the head. */
    4385               0 :         while (w)
    4386                 :         {
    4387               0 :             s = (uint32_t) *src++;
    4388               0 :             d = (uint32_t) *dst;
    4389                 : 
    4390               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4391                 :                 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
    4392               0 :             w--;
    4393                 :         }
    4394                 :     }
    4395                 : 
    4396               0 : }
    4397                 : 
    4398                 : static void
    4399               0 : sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
    4400                 :                           pixman_op_t              op,
    4401                 :                           pixman_image_t *         src_image,
    4402                 :                           pixman_image_t *         mask_image,
    4403                 :                           pixman_image_t *         dst_image,
    4404                 :                           int32_t                  src_x,
    4405                 :                           int32_t                  src_y,
    4406                 :                           int32_t                  mask_x,
    4407                 :                           int32_t                  mask_y,
    4408                 :                           int32_t                  dest_x,
    4409                 :                           int32_t                  dest_y,
    4410                 :                           int32_t                  width,
    4411                 :                           int32_t                  height)
    4412                 : {
                               /*
                                * Fast path for a solid source, an 8-bit mask and an 8-bit
                                * destination (both rows are fetched as uint8_t below).  Every
                                * destination byte d is replaced by
                                *     pix_multiply (alpha, m) + d
                                * where alpha comes from the solid source, m is the mask byte and
                                * the addition is _mm_adds_epu16 (saturating on 16-bit lanes).
                                * The final clamp to 8 bits presumably happens in the pack_*
                                * helpers, which are defined outside this view -- confirm.
                                *
                                * Rows are processed as: scalar head until dst is 16-byte
                                * aligned, 16 pixels per iteration, then a scalar tail.
                                *
                                * op, src_x and src_y are not read by this function.
                                */
    4413                 :     uint8_t     *dst_line, *dst;
    4414                 :     uint8_t     *mask_line, *mask;
    4415                 :     int dst_stride, mask_stride;
    4416                 :     int32_t w;
    4417                 :     uint32_t src;
    4418                 :     uint8_t sa;
    4419                 :     uint32_t m, d;
    4420                 : 
    4421                 :     __m128i xmm_alpha;
    4422                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4423                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4424                 : 
    4425               0 :     PIXMAN_IMAGE_GET_LINE (
    4426                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4427               0 :     PIXMAN_IMAGE_GET_LINE (
    4428                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4429                 : 
    4430               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    4431                 : 
                               /* NOTE(review): sa is assigned here but never read afterwards. */
    4432               0 :     sa = src >> 24;
    4433                 : 
                               /* The source alpha is expanded once, outside the loops. */
    4434               0 :     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
    4435                 : 
    4436               0 :     while (height--)
    4437                 :     {
    4438               0 :         dst = dst_line;
    4439               0 :         dst_line += dst_stride;
    4440               0 :         mask = mask_line;
    4441               0 :         mask_line += mask_stride;
    4442               0 :         w = width;
    4443                 : 
                                   /*
                                    * Head: single pixels until dst is 16-byte aligned, which the
                                    * aligned load/store in the next loop depends on.
                                    * NOTE(review): unsigned long is narrower than a pointer on
                                    * LLP64 targets; harmless for a low-bits test, but uintptr_t
                                    * would be the portable spelling.
                                    */
    4444               0 :         while (w && ((unsigned long)dst & 15))
    4445                 :         {
    4446               0 :             m = (uint32_t) *mask++;
    4447               0 :             d = (uint32_t) *dst;
    4448                 : 
    4449               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4450                 :                 _mm_adds_epu16 (
    4451                 :                     pix_multiply_1x128 (
    4452                 :                         xmm_alpha, unpack_32_1x128 (m)),
    4453                 :                     unpack_32_1x128 (d)));
    4454               0 :             w--;
    4455                 :         }
    4456                 : 
                                   /* Body: 16 mask bytes and 16 destination bytes per iteration. */
    4457               0 :         while (w >= 16)
    4458                 :         {
    4459               0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    4460               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    4461                 : 
    4462               0 :             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4463               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4464                 : 
                                       /* mask = alpha * mask (result written back over the mask halves) */
    4465                 :             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
    4466                 :                                 &xmm_mask_lo, &xmm_mask_hi,
    4467                 :                                 &xmm_mask_lo, &xmm_mask_hi);
    4468                 : 
                                       /* dst = mask + dst, saturating on the unpacked 16-bit lanes */
    4469               0 :             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
    4470               0 :             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
    4471                 : 
    4472               0 :             save_128_aligned (
    4473                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4474                 : 
    4475               0 :             mask += 16;
    4476               0 :             dst += 16;
    4477               0 :             w -= 16;
    4478                 :         }
    4479                 : 
                                   /* Tail: the remaining pixels, same per-pixel code as the head. */
    4480               0 :         while (w)
    4481                 :         {
    4482               0 :             m = (uint32_t) *mask++;
    4483               0 :             d = (uint32_t) *dst;
    4484                 : 
    4485               0 :             *dst++ = (uint8_t) pack_1x128_32 (
    4486                 :                 _mm_adds_epu16 (
    4487                 :                     pix_multiply_1x128 (
    4488                 :                         xmm_alpha, unpack_32_1x128 (m)),
    4489                 :                     unpack_32_1x128 (d)));
    4490                 : 
    4491               0 :             w--;
    4492                 :         }
    4493                 :     }
    4494                 : 
    4495               0 : }
    4496                 : 
    4497                 : static void
    4498               0 : sse2_composite_add_n_8 (pixman_implementation_t *imp,
    4499                 :                         pixman_op_t              op,
    4500                 :                         pixman_image_t *         src_image,
    4501                 :                         pixman_image_t *         mask_image,
    4502                 :                         pixman_image_t *         dst_image,
    4503                 :                         int32_t                  src_x,
    4504                 :                         int32_t                  src_y,
    4505                 :                         int32_t                  mask_x,
    4506                 :                         int32_t                  mask_y,
    4507                 :                         int32_t                  dest_x,
    4508                 :                         int32_t                  dest_y,
    4509                 :                         int32_t                  width,
    4510                 :                         int32_t                  height)
    4511                 : {
                               /*
                                * Fast path for a solid source and an 8-bit destination with no
                                * mask access (mask_image, mask_x and mask_y are not read).  The
                                * top byte of the solid source is added to every destination byte
                                * with unsigned byte saturation (_mm_adds_epu8).
                                *
                                * Two shortcuts are taken on that byte: 0x00 returns immediately,
                                * 0xff fills the rectangle with 0xff through pixman_fill.
                                *
                                * op, src_x and src_y are not read by this function.
                                */
    4512                 :     uint8_t     *dst_line, *dst;
    4513                 :     int dst_stride;
    4514                 :     int32_t w;
    4515                 :     uint32_t src;
    4516                 : 
    4517                 :     __m128i xmm_src;
    4518                 : 
    4519               0 :     PIXMAN_IMAGE_GET_LINE (
    4520                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4521                 : 
    4522               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    4523                 : 
                               /* Keep only bits 24..31 of the solid pixel. */
    4524               0 :     src >>= 24;
    4525                 : 
                               /* Adding zero changes nothing: return without touching dst. */
    4526               0 :     if (src == 0x00)
    4527               0 :         return;
    4528                 : 
                               /*
                                * Saturating-add of 0xff always yields 0xff, so the rectangle is
                                * handed to pixman_fill at 8 bits per pixel instead.
                                */
    4529               0 :     if (src == 0xff)
    4530                 :     {
    4531               0 :         pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
    4532                 :                      8, dest_x, dest_y, width, height, 0xff);
    4533                 : 
    4534               0 :         return;
    4535                 :     }
    4536                 : 
                               /*
                                * Replicate the byte into all four bytes of src, then into all
                                * sixteen bytes of xmm_src.
                                */
    4537               0 :     src = (src << 24) | (src << 16) | (src << 8) | src;
    4538               0 :     xmm_src = _mm_set_epi32 (src, src, src, src);
    4539                 : 
    4540               0 :     while (height--)
    4541                 :     {
    4542               0 :         dst = dst_line;
    4543               0 :         dst_line += dst_stride;
    4544               0 :         w = width;
    4545                 : 
                                   /*
                                    * Head: single bytes until dst is 16-byte aligned, which the
                                    * aligned load/store in the next loop depends on.
                                    * NOTE(review): unsigned long is narrower than a pointer on
                                    * LLP64 targets; harmless for a low-bits test, but uintptr_t
                                    * would be the portable spelling.
                                    */
    4546               0 :         while (w && ((unsigned long)dst & 15))
    4547                 :         {
                                       /*
                                        * The destination byte is placed in the low lane, added
                                        * with byte saturation, and the cast keeps only the low
                                        * byte of the extracted 32-bit result.
                                        */
    4548               0 :             *dst = (uint8_t)_mm_cvtsi128_si32 (
    4549                 :                 _mm_adds_epu8 (
    4550                 :                     xmm_src,
    4551               0 :                     _mm_cvtsi32_si128 (*dst)));
    4552                 : 
    4553               0 :             w--;
    4554               0 :             dst++;
    4555                 :         }
    4556                 : 
                                   /* Body: 16 destination bytes per iteration, load-add-store in place. */
    4557               0 :         while (w >= 16)
    4558                 :         {
    4559               0 :             save_128_aligned (
    4560                 :                 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
    4561                 : 
    4562               0 :             dst += 16;
    4563               0 :             w -= 16;
    4564                 :         }
    4565                 : 
                                   /* Tail: the remaining bytes, same per-byte code as the head. */
    4566               0 :         while (w)
    4567                 :         {
    4568               0 :             *dst = (uint8_t)_mm_cvtsi128_si32 (
    4569                 :                 _mm_adds_epu8 (
    4570                 :                     xmm_src,
    4571               0 :                     _mm_cvtsi32_si128 (*dst)));
    4572                 : 
    4573               0 :             w--;
    4574               0 :             dst++;
    4575                 :         }
    4576                 :     }
    4577                 : 
    4578                 : }
    4579                 : 
    4580                 : static void
    4581               0 : sse2_composite_add_8_8 (pixman_implementation_t *imp,
    4582                 :                         pixman_op_t              op,
    4583                 :                         pixman_image_t *         src_image,
    4584                 :                         pixman_image_t *         mask_image,
    4585                 :                         pixman_image_t *         dst_image,
    4586                 :                         int32_t                  src_x,
    4587                 :                         int32_t                  src_y,
    4588                 :                         int32_t                  mask_x,
    4589                 :                         int32_t                  mask_y,
    4590                 :                         int32_t                  dest_x,
    4591                 :                         int32_t                  dest_y,
    4592                 :                         int32_t                  width,
    4593                 :                         int32_t                  height)
    4594                 : {
                               /*
                                * Adds a width x height rectangle of source bytes onto the
                                * destination bytes, saturating each byte at 0xff.
                                *
                                * Each row is handled in three steps: single bytes until dst is
                                * 4-byte aligned, then w >> 2 32-bit words handed to
                                * sse2_combine_add_u (NOTE(review): assumed to perform the same
                                * saturating per-byte add -- confirm against its definition),
                                * then the remaining 0-3 bytes one at a time.
                                *
                                * mask_image, mask_x and mask_y are not used; imp and op are only
                                * forwarded to sse2_combine_add_u.
                                */
    4595                 :     uint8_t     *dst_line, *dst;
    4596                 :     uint8_t     *src_line, *src;
    4597                 :     int dst_stride, src_stride;
    4598                 :     int32_t w;
    4599                 :     uint16_t t;
    4600                 : 
    4601               0 :     PIXMAN_IMAGE_GET_LINE (
    4602                 :         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    4603               0 :     PIXMAN_IMAGE_GET_LINE (
    4604                 :         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    4605                 : 
    4606               0 :     while (height--)
    4607                 :     {
    4608               0 :         dst = dst_line;
    4609               0 :         src = src_line;
    4610                 : 
    4611               0 :         dst_line += dst_stride;
    4612               0 :         src_line += src_stride;
    4613               0 :         w = width;
    4614                 : 
    4615                 :         /* Small head */
    4616               0 :         while (w && (unsigned long)dst & 3)
    4617                 :         {
                                       /* t holds the 9-bit sum.  If bit 8 is set, 0 - 1 turns on
                                        * every bit, so the byte stored below becomes 0xff. */
    4618               0 :             t = (*dst) + (*src++);
    4619               0 :             *dst++ = t | (0 - (t >> 8));
    4620               0 :             w--;
    4621                 :         }
    4622                 : 
    4623               0 :         sse2_combine_add_u (imp, op,
    4624                 :                             (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
    4625                 : 
    4626                 :         /* Small tail */
                                   /* Skip the bytes consumed by the call above: w rounded down to a
                                    * multiple of 4.  The mask must cover the full int32_t range; a
                                    * 16-bit mask would under-advance for rows of 65536 bytes or more. */
    4627               0 :         dst += w & ~3;
    4628               0 :         src += w & ~3;
    4629                 : 
    4630               0 :         w &= 3;
    4631                 : 
    4632               0 :         while (w)
    4633                 :         {
    4634               0 :             t = (*dst) + (*src++);
    4635               0 :             *dst++ = t | (0 - (t >> 8));
    4636               0 :             w--;
    4637                 :         }
    4638                 :     }
    4639                 : 
    4640               0 : }
    4641                 : 
    4642                 : static void
    4643               0 : sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
    4644                 :                               pixman_op_t              op,
    4645                 :                               pixman_image_t *         src_image,
    4646                 :                               pixman_image_t *         mask_image,
    4647                 :                               pixman_image_t *         dst_image,
    4648                 :                               int32_t                  src_x,
    4649                 :                               int32_t                  src_y,
    4650                 :                               int32_t                  mask_x,
    4651                 :                               int32_t                  mask_y,
    4652                 :                               int32_t                  dest_x,
    4653                 :                               int32_t                  dest_y,
    4654                 :                               int32_t                  width,
    4655                 :                               int32_t                  height)
    4656                 : {
                               /*
                                * Row-by-row driver: for each of the `height` rows, passes the
                                * row's 32-bit destination and source pointers and `width` to
                                * sse2_combine_add_u with a NULL mask argument.
                                * NOTE(review): the per-pixel arithmetic lives entirely in
                                * sse2_combine_add_u, which is defined elsewhere -- presumably a
                                * saturating add; confirm there.
                                *
                                * mask_image, mask_x and mask_y are not used.
                                */
    4657                 :     uint32_t    *dst_line, *dst;
    4658                 :     uint32_t    *src_line, *src;
    4659                 :     int dst_stride, src_stride;
    4660                 : 
    4661               0 :     PIXMAN_IMAGE_GET_LINE (
    4662                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4663               0 :     PIXMAN_IMAGE_GET_LINE (
    4664                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4665                 : 
    4666               0 :     while (height--)
    4667                 :     {
                                   /* Take this row's pointers, then step the line pointers to
                                    * the next row before doing the work. */
    4668               0 :         dst = dst_line;
    4669               0 :         dst_line += dst_stride;
    4670               0 :         src = src_line;
    4671               0 :         src_line += src_stride;
    4672                 : 
    4673                 :         sse2_combine_add_u (imp, op, dst, src, NULL, width);
    4674                 :     }
    4675                 : 
    4676               0 : }
    4677                 : 
    4678                 : static pixman_bool_t
    4679              27 : pixman_blt_sse2 (uint32_t *src_bits,
    4680                 :                  uint32_t *dst_bits,
    4681                 :                  int       src_stride,
    4682                 :                  int       dst_stride,
    4683                 :                  int       src_bpp,
    4684                 :                  int       dst_bpp,
    4685                 :                  int       src_x,
    4686                 :                  int       src_y,
    4687                 :                  int       dst_x,
    4688                 :                  int       dst_y,
    4689                 :                  int       width,
    4690                 :                  int       height)
    4691                 : {
                               /*
                                * Copies a width x height rectangle of pixels from src_bits at
                                * (src_x, src_y) to dst_bits at (dst_x, dst_y).
                                *
                                * src_stride and dst_stride arrive in uint32_t units and are
                                * converted to bytes below.  Only equal depths of 16 or 32 bits
                                * per pixel are handled.
                                *
                                * Returns FALSE, without copying anything, when the two depths
                                * differ or the depth is neither 16 nor 32; TRUE otherwise.
                                *
                                * Every 128-bit transfer loads with load_128_unaligned and stores
                                * with save_128_aligned, so only the destination pointer is
                                * brought to a 16-byte boundary first.
                                */
    4692                 :     uint8_t *   src_bytes;
    4693                 :     uint8_t *   dst_bytes;
    4694                 :     int byte_width;
    4695                 : 
    4696              27 :     if (src_bpp != dst_bpp)
    4697               0 :         return FALSE;
    4698                 : 
    4699              27 :     if (src_bpp == 16)
    4700                 :     {
                                   /* Strides become uint16_t units for the pointer arithmetic,
                                    * then bytes for the row loop. */
    4701               0 :         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
    4702               0 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
    4703               0 :         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
    4704               0 :         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
    4705               0 :         byte_width = 2 * width;
    4706               0 :         src_stride *= 2;
    4707               0 :         dst_stride *= 2;
    4708                 :     }
    4709              27 :     else if (src_bpp == 32)
    4710                 :     {
                                   /* Strides are already in uint32_t units (the scaling below is
                                    * by 4 / 4); they become bytes after the pointers are formed. */
    4711              27 :         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
    4712              27 :         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
    4713              27 :         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
    4714              27 :         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
    4715              27 :         byte_width = 4 * width;
    4716              27 :         src_stride *= 4;
    4717              27 :         dst_stride *= 4;
    4718                 :     }
    4719                 :     else
    4720                 :     {
    4721               0 :         return FALSE;
    4722                 :     }
    4723                 : 
    4724            1047 :     while (height--)
    4725                 :     {
    4726                 :         int w;
    4727             993 :         uint8_t *s = src_bytes;
    4728             993 :         uint8_t *d = dst_bytes;
    4729             993 :         src_bytes += src_stride;
    4730             993 :         dst_bytes += dst_stride;
    4731             993 :         w = byte_width;
    4732                 : 
                                   /* 16-bit copies until d is 4-byte aligned. */
    4733            1986 :         while (w >= 2 && ((unsigned long)d & 3))
    4734                 :         {
    4735               0 :             *(uint16_t *)d = *(uint16_t *)s;
    4736               0 :             w -= 2;
    4737               0 :             s += 2;
    4738               0 :             d += 2;
    4739                 :         }
    4740                 : 
                                   /* 32-bit copies until d is 16-byte aligned. */
    4741            2382 :         while (w >= 4 && ((unsigned long)d & 15))
    4742                 :         {
    4743             396 :             *(uint32_t *)d = *(uint32_t *)s;
    4744                 : 
    4745             396 :             w -= 4;
    4746             396 :             s += 4;
    4747             396 :             d += 4;
    4748                 :         }
    4749                 : 
                                   /* Bulk of the row: 64 bytes per iteration through four
                                    * 128-bit values. */
    4750            4576 :         while (w >= 64)
    4751                 :         {
    4752                 :             __m128i xmm0, xmm1, xmm2, xmm3;
    4753                 : 
    4754            5180 :             xmm0 = load_128_unaligned ((__m128i*)(s));
    4755            5180 :             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
    4756            5180 :             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
    4757            5180 :             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
    4758                 : 
    4759            2590 :             save_128_aligned ((__m128i*)(d),    xmm0);
    4760            2590 :             save_128_aligned ((__m128i*)(d + 16), xmm1);
    4761            2590 :             save_128_aligned ((__m128i*)(d + 32), xmm2);
    4762            2590 :             save_128_aligned ((__m128i*)(d + 48), xmm3);
    4763                 : 
    4764            2590 :             s += 64;
    4765            2590 :             d += 64;
    4766            2590 :             w -= 64;
    4767                 :         }
    4768                 : 
                                   /* Remaining whole 16-byte chunks. */
    4769            2190 :         while (w >= 16)
    4770                 :         {
    4771             408 :             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
    4772                 : 
    4773             204 :             w -= 16;
    4774             204 :             d += 16;
    4775             204 :             s += 16;
    4776                 :         }
    4777                 : 
                                   /* Tail: 32-bit copies, then at most one 16-bit copy. */
    4778            2388 :         while (w >= 4)
    4779                 :         {
    4780             402 :             *(uint32_t *)d = *(uint32_t *)s;
    4781                 : 
    4782             402 :             w -= 4;
    4783             402 :             s += 4;
    4784             402 :             d += 4;
    4785                 :         }
    4786                 : 
    4787             993 :         if (w >= 2)
    4788                 :         {
    4789               0 :             *(uint16_t *)d = *(uint16_t *)s;
    4790               0 :             w -= 2;
    4791               0 :             s += 2;
    4792               0 :             d += 2;
    4793                 :         }
    4794                 :     }
    4795                 : 
    4796                 : 
    4797              27 :     return TRUE;
    4798                 : }
    4799                 : 
    4800                 : static void
    4801              27 : sse2_composite_copy_area (pixman_implementation_t *imp,
    4802                 :                           pixman_op_t              op,
    4803                 :                           pixman_image_t *         src_image,
    4804                 :                           pixman_image_t *         mask_image,
    4805                 :                           pixman_image_t *         dst_image,
    4806                 :                           int32_t                  src_x,
    4807                 :                           int32_t                  src_y,
    4808                 :                           int32_t                  mask_x,
    4809                 :                           int32_t                  mask_y,
    4810                 :                           int32_t                  dest_x,
    4811                 :                           int32_t                  dest_y,
    4812                 :                           int32_t                  width,
    4813                 :                           int32_t                  height)
    4814                 : {
                               /*
                                * Forwards the source and destination buffers, row strides,
                                * per-format bit depths and the rectangle to pixman_blt_sse2.
                                *
                                * The pixman_bool_t result of pixman_blt_sse2 is discarded, so a
                                * FALSE return (no copy performed) is not reported to the caller.
                                * imp, op, mask_image, mask_x and mask_y are not used.
                                */
    4815              54 :     pixman_blt_sse2 (src_image->bits.bits,
    4816                 :                      dst_image->bits.bits,
    4817                 :                      src_image->bits.rowstride,
    4818                 :                      dst_image->bits.rowstride,
    4819              27 :                      PIXMAN_FORMAT_BPP (src_image->bits.format),
    4820              27 :                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
    4821                 :                      src_x, src_y, dest_x, dest_y, width, height);
    4822              27 : }
    4823                 : 
    4824                 : static void
    4825               0 : sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
    4826                 :                                  pixman_op_t              op,
    4827                 :                                  pixman_image_t *         src_image,
    4828                 :                                  pixman_image_t *         mask_image,
    4829                 :                                  pixman_image_t *         dst_image,
    4830                 :                                  int32_t                  src_x,
    4831                 :                                  int32_t                  src_y,
    4832                 :                                  int32_t                  mask_x,
    4833                 :                                  int32_t                  mask_y,
    4834                 :                                  int32_t                  dest_x,
    4835                 :                                  int32_t                  dest_y,
    4836                 :                                  int32_t                  width,
    4837                 :                                  int32_t                  height)
    4838                 : {
                               /*
                                * Blends 32-bit source pixels, with their top byte forced to
                                * 0xff, onto 32-bit destination pixels through an 8-bit mask.
                                *
                                * Each row runs in three phases: single pixels until dst is
                                * 16-byte aligned, four pixels per iteration with aligned
                                * destination loads and stores, then single pixels for the rest.
                                *
                                * NOTE(review): in_over_1x128 / in_over_2x128 are defined
                                * elsewhere; presumably they compute (src IN mask) OVER dst.
                                * mask_00ff is passed in the source-alpha position -- confirm
                                * against those helpers.
                                *
                                * imp and op are not used.
                                */
    4839                 :     uint32_t    *src, *src_line, s;
    4840                 :     uint32_t    *dst, *dst_line, d;
    4841                 :     uint8_t         *mask, *mask_line;
    4842                 :     uint32_t m;
    4843                 :     int src_stride, mask_stride, dst_stride;
    4844                 :     int32_t w;
    4845                 :     __m128i ms;
    4846                 : 
    4847                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    4848                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4849                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4850                 : 
    4851               0 :     PIXMAN_IMAGE_GET_LINE (
    4852                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4853               0 :     PIXMAN_IMAGE_GET_LINE (
    4854                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4855               0 :     PIXMAN_IMAGE_GET_LINE (
    4856                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4857                 : 
    4858               0 :     while (height--)
    4859                 :     {
    4860               0 :         src = src_line;
    4861               0 :         src_line += src_stride;
    4862               0 :         dst = dst_line;
    4863               0 :         dst_line += dst_stride;
    4864               0 :         mask = mask_line;
    4865               0 :         mask_line += mask_stride;
    4866                 : 
    4867               0 :         w = width;
    4868                 : 
                                   /* Head: one pixel at a time until dst is 16-byte aligned. */
    4869               0 :         while (w && (unsigned long)dst & 15)
    4870                 :         {
    4871               0 :             s = 0xff000000 | *src++;
    4872               0 :             m = (uint32_t) *mask++;
    4873               0 :             d = *dst;
    4874               0 :             ms = unpack_32_1x128 (s);
    4875                 : 
                                       /* With m == 0xff, ms is stored below without blending. */
    4876               0 :             if (m != 0xff)
    4877                 :             {
    4878               0 :                 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    4879               0 :                 __m128i md = unpack_32_1x128 (d);
    4880                 : 
    4881               0 :                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
    4882                 :             }
    4883                 : 
    4884               0 :             *dst++ = pack_1x128_32 (ms);
    4885               0 :             w--;
    4886                 :         }
    4887                 : 
                                   /* Body: four pixels per iteration; dst is 16-byte aligned here. */
    4888               0 :         while (w >= 4)
    4889                 :         {
                                       /* Four consecutive mask bytes read as one 32-bit value. */
    4890               0 :             m = *(uint32_t*) mask;
    4891               0 :             xmm_src = _mm_or_si128 (
    4892                 :                 load_128_unaligned ((__m128i*)src), mask_ff000000);
    4893                 : 
                                       /* All four mask bytes are 0xff: store the source as is. */
    4894               0 :             if (m == 0xffffffff)
    4895                 :             {
    4896               0 :                 save_128_aligned ((__m128i*)dst, xmm_src);
    4897                 :             }
    4898                 :             else
    4899                 :             {
    4900               0 :                 xmm_dst = load_128_aligned ((__m128i*)dst);
    4901                 : 
    4902               0 :                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
    4903                 : 
    4904               0 :                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    4905               0 :                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    4906               0 :                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    4907                 : 
    4908               0 :                 expand_alpha_rev_2x128 (
    4909                 :                     xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    4910                 : 
                                           /* The blended result is read back from xmm_dst_lo/hi below. */
    4911                 :                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    4912                 :                                &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
    4913                 :                                &xmm_dst_lo, &xmm_dst_hi);
    4914                 : 
    4915               0 :                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    4916                 :             }
    4917                 : 
    4918               0 :             src += 4;
    4919               0 :             dst += 4;
    4920               0 :             mask += 4;
    4921               0 :             w -= 4;
    4922                 :         }
    4923                 : 
                                   /* Tail: remaining pixels.  A zero mask byte leaves dst untouched. */
    4924               0 :         while (w)
    4925                 :         {
    4926               0 :             m = (uint32_t) *mask++;
    4927                 : 
    4928               0 :             if (m)
    4929                 :             {
    4930               0 :                 s = 0xff000000 | *src;
    4931                 : 
    4932               0 :                 if (m == 0xff)
    4933                 :                 {
    4934               0 :                     *dst = s;
    4935                 :                 }
    4936                 :                 else
    4937                 :                 {
                                               /* This ms shadows the function-scope ms declared above. */
    4938                 :                     __m128i ma, md, ms;
    4939                 : 
    4940               0 :                     d = *dst;
    4941                 : 
    4942               0 :                     ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
    4943               0 :                     md = unpack_32_1x128 (d);
    4944               0 :                     ms = unpack_32_1x128 (s);
    4945                 : 
    4946               0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
    4947                 :                 }
    4948                 : 
    4949                 :             }
    4950                 : 
    4951               0 :             src++;
    4952               0 :             dst++;
    4953               0 :             w--;
    4954                 :         }
    4955                 :     }
    4956                 : 
    4957               0 : }
    4958                 : 
    4959                 : static void
    4960               0 : sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
    4961                 :                                  pixman_op_t              op,
    4962                 :                                  pixman_image_t *         src_image,
    4963                 :                                  pixman_image_t *         mask_image,
    4964                 :                                  pixman_image_t *         dst_image,
    4965                 :                                  int32_t                  src_x,
    4966                 :                                  int32_t                  src_y,
    4967                 :                                  int32_t                  mask_x,
    4968                 :                                  int32_t                  mask_y,
    4969                 :                                  int32_t                  dest_x,
    4970                 :                                  int32_t                  dest_y,
    4971                 :                                  int32_t                  width,
    4972                 :                                  int32_t                  height)
    4973                 : {
                               /*
                                * Blends 32-bit source pixels onto 32-bit destination pixels
                                * through an 8-bit mask, using the top byte of each source pixel
                                * (s >> 24) as its alpha.
                                *
                                * Each row runs in three phases: single pixels until dst is
                                * 16-byte aligned, four pixels per iteration with aligned
                                * destination loads and stores, then single pixels for the rest.
                                * In every phase a zero mask leaves dst untouched, and a fully
                                * set mask together with fully opaque source stores the source
                                * unchanged.
                                *
                                * NOTE(review): in_over_1x128 / in_over_2x128 are defined
                                * elsewhere; presumably they compute (src IN mask) OVER dst --
                                * confirm against those helpers.
                                *
                                * imp and op are not used.
                                */
    4974                 :     uint32_t    *src, *src_line, s;
    4975                 :     uint32_t    *dst, *dst_line, d;
    4976                 :     uint8_t         *mask, *mask_line;
    4977                 :     uint32_t m;
    4978                 :     int src_stride, mask_stride, dst_stride;
    4979                 :     int32_t w;
    4980                 : 
    4981                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    4982                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    4983                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    4984                 : 
    4985               0 :     PIXMAN_IMAGE_GET_LINE (
    4986                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    4987               0 :     PIXMAN_IMAGE_GET_LINE (
    4988                 :         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    4989               0 :     PIXMAN_IMAGE_GET_LINE (
    4990                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    4991                 : 
    4992               0 :     while (height--)
    4993                 :     {
    4994               0 :         src = src_line;
    4995               0 :         src_line += src_stride;
    4996               0 :         dst = dst_line;
    4997               0 :         dst_line += dst_stride;
    4998               0 :         mask = mask_line;
    4999               0 :         mask_line += mask_stride;
    5000                 : 
    5001               0 :         w = width;
    5002                 : 
                                   /* Head: one pixel at a time until dst is 16-byte aligned. */
    5003               0 :         while (w && (unsigned long)dst & 15)
    5004                 :         {
    5005                 :             uint32_t sa;
    5006                 : 
    5007               0 :             s = *src++;
    5008               0 :             m = (uint32_t) *mask++;
    5009               0 :             d = *dst;
    5010                 : 
                                       /* Source alpha is the top byte of the pixel. */
    5011               0 :             sa = s >> 24;
    5012                 : 
    5013               0 :             if (m)
    5014                 :             {
    5015               0 :                 if (sa == 0xff && m == 0xff)
    5016                 :                 {
    5017               0 :                     *dst = s;
    5018                 :                 }
    5019                 :                 else
    5020                 :                 {
    5021                 :                     __m128i ms, md, ma, msa;
    5022                 : 
    5023               0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5024               0 :                     ms = unpack_32_1x128 (s);
    5025               0 :                     md = unpack_32_1x128 (d);
    5026                 : 
    5027               0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5028                 : 
    5029               0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5030                 :                 }
    5031                 :             }
    5032                 : 
    5033               0 :             dst++;
    5034               0 :             w--;
    5035                 :         }
    5036                 : 
                                   /* Body: four pixels per iteration; dst is 16-byte aligned here. */
    5037               0 :         while (w >= 4)
    5038                 :         {
                                       /* Four consecutive mask bytes read as one 32-bit value. */
    5039               0 :             m = *(uint32_t *) mask;
    5040                 : 
                                       /* All four mask bytes zero: nothing to do for this group. */
    5041               0 :             if (m)
    5042                 :             {
    5043               0 :                 xmm_src = load_128_unaligned ((__m128i*)src);
    5044                 : 
    5045               0 :                 if (m == 0xffffffff && is_opaque (xmm_src))
    5046                 :                 {
    5047               0 :                     save_128_aligned ((__m128i *)dst, xmm_src);
    5048                 :                 }
    5049                 :                 else
    5050                 :                 {
    5051               0 :                     xmm_dst = load_128_aligned ((__m128i *)dst);
    5052                 : 
    5053               0 :                     xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
    5054                 : 
    5055               0 :                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5056               0 :                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    5057               0 :                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5058                 : 
    5059               0 :                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
    5060               0 :                     expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    5061                 : 
                                               /* The blended result is read back from xmm_dst_lo/hi below. */
    5062                 :                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
    5063                 :                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
    5064                 : 
    5065               0 :                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5066                 :                 }
    5067                 :             }
    5068                 : 
    5069               0 :             src += 4;
    5070               0 :             dst += 4;
    5071               0 :             mask += 4;
    5072               0 :             w -= 4;
    5073                 :         }
    5074                 : 
                                   /* Tail: remaining pixels, same per-pixel logic as the head loop. */
    5075               0 :         while (w)
    5076                 :         {
    5077                 :             uint32_t sa;
    5078                 : 
    5079               0 :             s = *src++;
    5080               0 :             m = (uint32_t) *mask++;
    5081               0 :             d = *dst;
    5082                 : 
    5083               0 :             sa = s >> 24;
    5084                 : 
    5085               0 :             if (m)
    5086                 :             {
    5087               0 :                 if (sa == 0xff && m == 0xff)
    5088                 :                 {
    5089               0 :                     *dst = s;
    5090                 :                 }
    5091                 :                 else
    5092                 :                 {
    5093                 :                     __m128i ms, md, ma, msa;
    5094                 : 
    5095               0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5096               0 :                     ms = unpack_32_1x128 (s);
    5097               0 :                     md = unpack_32_1x128 (d);
    5098                 : 
    5099               0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5100                 : 
    5101               0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5102                 :                 }
    5103                 :             }
    5104                 : 
    5105               0 :             dst++;
    5106               0 :             w--;
    5107                 :         }
    5108                 :     }
    5109                 : 
    5110               0 : }
    5111                 : 
    5112                 : static void
    5113               0 : sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
    5114                 :                                     pixman_op_t              op,
    5115                 :                                     pixman_image_t *         src_image,
    5116                 :                                     pixman_image_t *         mask_image,
    5117                 :                                     pixman_image_t *         dst_image,
    5118                 :                                     int32_t                  src_x,
    5119                 :                                     int32_t                  src_y,
    5120                 :                                     int32_t                  mask_x,
    5121                 :                                     int32_t                  mask_y,
    5122                 :                                     int32_t                  dest_x,
    5123                 :                                     int32_t                  dest_y,
    5124                 :                                     int32_t                  width,
    5125                 :                                     int32_t                  height)
    5126                 : {
    5127                 :     uint32_t src;
    5128                 :     uint32_t    *dst_line, *dst;
    5129                 :     __m128i xmm_src;
    5130                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5131                 :     __m128i xmm_dsta_hi, xmm_dsta_lo;
    5132                 :     int dst_stride;
    5133                 :     int32_t w;
    5134                 : 
    5135               0 :     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
    5136                 : 
    5137               0 :     if (src == 0)
    5138               0 :         return;
    5139                 : 
    5140               0 :     PIXMAN_IMAGE_GET_LINE (
    5141                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    5142                 : 
    5143               0 :     xmm_src = expand_pixel_32_1x128 (src);
    5144                 : 
    5145               0 :     while (height--)
    5146                 :     {
    5147               0 :         dst = dst_line;
    5148                 : 
    5149               0 :         dst_line += dst_stride;
    5150               0 :         w = width;
    5151                 : 
    5152               0 :         while (w && (unsigned long)dst & 15)
    5153                 :         {
    5154                 :             __m128i vd;
    5155                 : 
    5156               0 :             vd = unpack_32_1x128 (*dst);
    5157                 : 
    5158               0 :             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
    5159                 :                                               xmm_src));
    5160               0 :             w--;
    5161               0 :             dst++;
    5162                 :         }
    5163                 : 
    5164               0 :         while (w >= 4)
    5165                 :         {
    5166                 :             __m128i tmp_lo, tmp_hi;
    5167                 : 
    5168               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    5169                 : 
    5170               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5171               0 :             expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
    5172                 : 
    5173               0 :             tmp_lo = xmm_src;
    5174               0 :             tmp_hi = xmm_src;
    5175                 : 
    5176                 :             over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
    5177                 :                         &xmm_dsta_lo, &xmm_dsta_hi,
    5178                 :                         &tmp_lo, &tmp_hi);
    5179                 : 
    5180               0 :             save_128_aligned (
    5181                 :                 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
    5182                 : 
    5183               0 :             w -= 4;
    5184               0 :             dst += 4;
    5185                 :         }
    5186                 : 
    5187               0 :         while (w)
    5188                 :         {
    5189                 :             __m128i vd;
    5190                 : 
    5191               0 :             vd = unpack_32_1x128 (*dst);
    5192                 : 
    5193               0 :             *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
    5194                 :                                               xmm_src));
    5195               0 :             w--;
    5196               0 :             dst++;
    5197                 :         }
    5198                 : 
    5199                 :     }
    5200                 : 
    5201                 : }
    5202                 : 
    5203                 : static void
    5204               0 : sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
    5205                 :                                     pixman_op_t              op,
    5206                 :                                     pixman_image_t *         src_image,
    5207                 :                                     pixman_image_t *         mask_image,
    5208                 :                                     pixman_image_t *         dst_image,
    5209                 :                                     int32_t                  src_x,
    5210                 :                                     int32_t                  src_y,
    5211                 :                                     int32_t                  mask_x,
    5212                 :                                     int32_t                  mask_y,
    5213                 :                                     int32_t                  dest_x,
    5214                 :                                     int32_t                  dest_y,
    5215                 :                                     int32_t                  width,
    5216                 :                                     int32_t                  height)
    5217                 : {
    5218                 :     uint32_t    *src, *src_line, s;
    5219                 :     uint32_t    *dst, *dst_line, d;
    5220                 :     uint32_t    *mask, *mask_line;
    5221                 :     uint32_t    m;
    5222                 :     int src_stride, mask_stride, dst_stride;
    5223                 :     int32_t w;
    5224                 : 
    5225                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    5226                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5227                 :     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    5228                 : 
    5229               0 :     PIXMAN_IMAGE_GET_LINE (
    5230                 :         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    5231               0 :     PIXMAN_IMAGE_GET_LINE (
    5232                 :         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    5233               0 :     PIXMAN_IMAGE_GET_LINE (
    5234                 :         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    5235                 : 
    5236               0 :     while (height--)
    5237                 :     {
    5238               0 :         src = src_line;
    5239               0 :         src_line += src_stride;
    5240               0 :         dst = dst_line;
    5241               0 :         dst_line += dst_stride;
    5242               0 :         mask = mask_line;
    5243               0 :         mask_line += mask_stride;
    5244                 : 
    5245               0 :         w = width;
    5246                 : 
    5247               0 :         while (w && (unsigned long)dst & 15)
    5248                 :         {
    5249                 :             uint32_t sa;
    5250                 : 
    5251               0 :             s = *src++;
    5252               0 :             m = (*mask++) >> 24;
    5253               0 :             d = *dst;
    5254                 : 
    5255               0 :             sa = s >> 24;
    5256                 : 
    5257               0 :             if (m)
    5258                 :             {
    5259               0 :                 if (sa == 0xff && m == 0xff)
    5260                 :                 {
    5261               0 :                     *dst = s;
    5262                 :                 }
    5263                 :                 else
    5264                 :                 {
    5265                 :                     __m128i ms, md, ma, msa;
    5266                 : 
    5267               0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5268               0 :                     ms = unpack_32_1x128 (s);
    5269               0 :                     md = unpack_32_1x128 (d);
    5270                 : 
    5271               0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5272                 : 
    5273               0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5274                 :                 }
    5275                 :             }
    5276                 : 
    5277               0 :             dst++;
    5278               0 :             w--;
    5279                 :         }
    5280                 : 
    5281               0 :         while (w >= 4)
    5282                 :         {
    5283               0 :             xmm_mask = load_128_unaligned ((__m128i*)mask);
    5284                 : 
    5285               0 :             if (!is_transparent (xmm_mask))
    5286                 :             {
    5287               0 :                 xmm_src = load_128_unaligned ((__m128i*)src);
    5288                 : 
    5289               0 :                 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
    5290                 :                 {
    5291               0 :                     save_128_aligned ((__m128i *)dst, xmm_src);
    5292                 :                 }
    5293                 :                 else
    5294                 :                 {
    5295               0 :                     xmm_dst = load_128_aligned ((__m128i *)dst);
    5296                 : 
    5297               0 :                     unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5298               0 :                     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
    5299               0 :                     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5300                 : 
    5301               0 :                     expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
    5302               0 :                     expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
    5303                 : 
    5304                 :                     in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
    5305                 :                                    &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
    5306                 : 
    5307               0 :                     save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5308                 :                 }
    5309                 :             }
    5310                 : 
    5311               0 :             src += 4;
    5312               0 :             dst += 4;
    5313               0 :             mask += 4;
    5314               0 :             w -= 4;
    5315                 :         }
    5316                 : 
    5317               0 :         while (w)
    5318                 :         {
    5319                 :             uint32_t sa;
    5320                 : 
    5321               0 :             s = *src++;
    5322               0 :             m = (*mask++) >> 24;
    5323               0 :             d = *dst;
    5324                 : 
    5325               0 :             sa = s >> 24;
    5326                 : 
    5327               0 :             if (m)
    5328                 :             {
    5329               0 :                 if (sa == 0xff && m == 0xff)
    5330                 :                 {
    5331               0 :                     *dst = s;
    5332                 :                 }
    5333                 :                 else
    5334                 :                 {
    5335                 :                     __m128i ms, md, ma, msa;
    5336                 : 
    5337               0 :                     ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
    5338               0 :                     ms = unpack_32_1x128 (s);
    5339               0 :                     md = unpack_32_1x128 (d);
    5340                 : 
    5341               0 :                     msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
    5342                 : 
    5343               0 :                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
    5344                 :                 }
    5345                 :             }
    5346                 : 
    5347               0 :             dst++;
    5348               0 :             w--;
    5349                 :         }
    5350                 :     }
    5351                 : 
    5352               0 : }
    5353                 : 
    5354                 : /* A variant of 'sse2_combine_over_u' with minor tweaks */
    5355                 : static force_inline void
    5356                 : scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
    5357                 :                                              const uint32_t* ps,
    5358                 :                                              int32_t         w,
    5359                 :                                              pixman_fixed_t  vx,
    5360                 :                                              pixman_fixed_t  unit_x,
    5361                 :                                              pixman_fixed_t  max_vx,
    5362                 :                                              pixman_bool_t   fully_transparent_src)
    5363                 : {
    5364                 :     uint32_t s, d;
    5365               0 :     const uint32_t* pm = NULL;
    5366                 : 
    5367                 :     __m128i xmm_dst_lo, xmm_dst_hi;
    5368                 :     __m128i xmm_src_lo, xmm_src_hi;
    5369                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    5370                 : 
    5371               0 :     if (fully_transparent_src)
    5372                 :         return;
    5373                 : 
    5374                 :     /* Align dst on a 16-byte boundary */
    5375               0 :     while (w && ((unsigned long)pd & 15))
    5376                 :     {
    5377               0 :         d = *pd;
    5378               0 :         s = combine1 (ps + (vx >> 16), pm);
    5379               0 :         vx += unit_x;
    5380                 : 
    5381               0 :         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    5382               0 :         if (pm)
    5383               0 :             pm++;
    5384               0 :         w--;
    5385                 :     }
    5386                 : 
    5387               0 :     while (w >= 4)
    5388                 :     {
    5389                 :         __m128i tmp;
    5390                 :         uint32_t tmp1, tmp2, tmp3, tmp4;
    5391                 : 
    5392               0 :         tmp1 = ps[vx >> 16];
    5393               0 :         vx += unit_x;
    5394               0 :         tmp2 = ps[vx >> 16];
    5395               0 :         vx += unit_x;
    5396               0 :         tmp3 = ps[vx >> 16];
    5397               0 :         vx += unit_x;
    5398               0 :         tmp4 = ps[vx >> 16];
    5399               0 :         vx += unit_x;
    5400                 : 
    5401               0 :         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
    5402                 : 
    5403               0 :         xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
    5404                 : 
    5405               0 :         if (is_opaque (xmm_src_hi))
    5406                 :         {
    5407               0 :             save_128_aligned ((__m128i*)pd, xmm_src_hi);
    5408                 :         }
    5409               0 :         else if (!is_zero (xmm_src_hi))
    5410                 :         {
    5411               0 :             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
    5412                 : 
    5413               0 :             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
    5414               0 :             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
    5415                 : 
    5416               0 :             expand_alpha_2x128 (
    5417                 :                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
    5418                 : 
    5419                 :             over_2x128 (&xmm_src_lo, &xmm_src_hi,
    5420                 :                         &xmm_alpha_lo, &xmm_alpha_hi,
    5421                 :                         &xmm_dst_lo, &xmm_dst_hi);
    5422                 : 
    5423                 :             /* rebuid the 4 pixel data and save*/
    5424               0 :             save_128_aligned ((__m128i*)pd,
    5425                 :                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5426                 :         }
    5427                 : 
    5428               0 :         w -= 4;
    5429               0 :         pd += 4;
    5430               0 :         if (pm)
    5431               0 :             pm += 4;
    5432                 :     }
    5433                 : 
    5434               0 :     while (w)
    5435                 :     {
    5436               0 :         d = *pd;
    5437               0 :         s = combine1 (ps + (vx >> 16), pm);
    5438               0 :         vx += unit_x;
    5439                 : 
    5440               0 :         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
    5441               0 :         if (pm)
    5442               0 :             pm++;
    5443                 : 
    5444               0 :         w--;
    5445                 :     }
    5446                 : }
    5447                 : 
    5448               0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
    5449                 :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5450               0 :                        uint32_t, uint32_t, COVER)
    5451               0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
    5452                 :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5453               0 :                        uint32_t, uint32_t, NONE)
    5454               0 : FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
    5455                 :                        scaled_nearest_scanline_sse2_8888_8888_OVER,
    5456               0 :                        uint32_t, uint32_t, PAD)
    5457                 : 
    5458                 : static force_inline void
    5459                 : scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
    5460                 :                                                uint32_t *       dst,
    5461                 :                                                const uint32_t * src,
    5462                 :                                                int32_t          w,
    5463                 :                                                pixman_fixed_t   vx,
    5464                 :                                                pixman_fixed_t   unit_x,
    5465                 :                                                pixman_fixed_t   max_vx,
    5466                 :                                                pixman_bool_t    zero_src)
    5467                 : {
    5468                 :     __m128i xmm_mask;
    5469                 :     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    5470                 :     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    5471                 :     __m128i xmm_alpha_lo, xmm_alpha_hi;
    5472                 : 
    5473               0 :     if (zero_src || (*mask >> 24) == 0)
    5474                 :         return;
    5475                 : 
    5476               0 :     xmm_mask = create_mask_16_128 (*mask >> 24);
    5477                 : 
    5478               0 :     while (w && (unsigned long)dst & 15)
    5479                 :     {
    5480               0 :         uint32_t s = src[pixman_fixed_to_int (vx)];
    5481               0 :         vx += unit_x;
    5482                 : 
    5483               0 :         if (s)
    5484                 :         {
    5485               0 :             uint32_t d = *dst;
    5486                 : 
    5487               0 :             __m128i ms = unpack_32_1x128 (s);
    5488               0 :             __m128i alpha     = expand_alpha_1x128 (ms);
    5489               0 :             __m128i dest      = xmm_mask;
    5490               0 :             __m128i alpha_dst = unpack_32_1x128 (d);
    5491                 : 
    5492               0 :             *dst = pack_1x128_32 (
    5493                 :                 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    5494                 :         }
    5495               0 :         dst++;
    5496               0 :         w--;
    5497                 :     }
    5498                 : 
    5499               0 :     while (w >= 4)
    5500                 :     {
    5501                 :         uint32_t tmp1, tmp2, tmp3, tmp4;
    5502                 : 
    5503               0 :         tmp1 = src[pixman_fixed_to_int (vx)];
    5504               0 :         vx += unit_x;
    5505               0 :         tmp2 = src[pixman_fixed_to_int (vx)];
    5506               0 :         vx += unit_x;
    5507               0 :         tmp3 = src[pixman_fixed_to_int (vx)];
    5508               0 :         vx += unit_x;
    5509               0 :         tmp4 = src[pixman_fixed_to_int (vx)];
    5510               0 :         vx += unit_x;
    5511                 : 
    5512               0 :         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
    5513                 : 
    5514               0 :         if (!is_zero (xmm_src))
    5515                 :         {
    5516               0 :             xmm_dst = load_128_aligned ((__m128i*)dst);
    5517                 : 
    5518               0 :             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    5519               0 :             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    5520               0 :             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
    5521                 :                                 &xmm_alpha_lo, &xmm_alpha_hi);
    5522                 : 
    5523                 :             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
    5524                 :                            &xmm_alpha_lo, &xmm_alpha_hi,
    5525                 :                            &xmm_mask, &xmm_mask,
    5526                 :                            &xmm_dst_lo, &xmm_dst_hi);
    5527                 : 
    5528               0 :             save_128_aligned (
    5529                 :                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    5530                 :         }
    5531                 : 
    5532               0 :         dst += 4;
    5533               0 :         w -= 4;
    5534                 :     }
    5535                 : 
    5536               0 :     while (w)
    5537                 :     {
    5538               0 :         uint32_t s = src[pixman_fixed_to_int (vx)];
    5539               0 :         vx += unit_x;
    5540                 : 
    5541               0 :         if (s)
    5542                 :         {
    5543               0 :             uint32_t d = *dst;
    5544                 : 
    5545               0 :             __m128i ms = unpack_32_1x128 (s);
    5546               0 :             __m128i alpha = expand_alpha_1x128 (ms);
    5547               0 :             __m128i mask  = xmm_mask;
    5548               0 :             __m128i dest  = unpack_32_1x128 (d);
    5549                 : 
    5550               0 :             *dst = pack_1x128_32 (
    5551                 :                 in_over_1x128 (&ms, &alpha, &mask, &dest));
    5552                 :         }
    5553                 : 
    5554               0 :         dst++;
    5555               0 :         w--;
    5556                 :     }
    5557                 : 
    5558                 : }
    5559                 : 
    5560               0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
    5561                 :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5562               0 :                               uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
    5563               0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
    5564                 :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5565               0 :                               uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
    5566               0 : FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
    5567                 :                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
    5568               0 :                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
    5569                 : 
    5570                 : static void
    5571             304 : bilinear_interpolate_line_sse2 (uint32_t *       out,
    5572                 :                                 const uint32_t * top,
    5573                 :                                 const uint32_t * bottom,
    5574                 :                                 int              wt,
    5575                 :                                 int              wb,
    5576                 :                                 pixman_fixed_t   x,
    5577                 :                                 pixman_fixed_t   ux,
    5578                 :                                 int              width)
    5579                 : {
    5580             608 :     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
    5581             608 :     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
    5582             304 :     const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
    5583             304 :     const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
    5584             608 :     const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
    5585             304 :     const __m128i xmm_zero = _mm_setzero_si128 ();
    5586             608 :     __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
    5587                 :     uint32_t pix1, pix2, pix3, pix4;
    5588                 : 
    5589                 :     #define INTERPOLATE_ONE_PIXEL(pix)                                          \
    5590                 :     do {                                                                        \
    5591                 :         __m128i xmm_wh, xmm_lo, xmm_hi, a;                                      \
    5592                 :         /* fetch 2x2 pixel block into sse2 register */                          \
    5593                 :         uint32_t tl = top [pixman_fixed_to_int (x)];                            \
    5594                 :         uint32_t tr = top [pixman_fixed_to_int (x) + 1];                        \
    5595                 :         uint32_t bl = bottom [pixman_fixed_to_int (x)];                         \
    5596                 :         uint32_t br = bottom [pixman_fixed_to_int (x) + 1];                     \
    5597                 :         a = _mm_set_epi32 (tr, tl, br, bl);                                     \
    5598                 :         x += ux;                                                                \
    5599                 :         /* vertical interpolation */                                            \
    5600                 :         a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),    \
    5601                 :                                             xmm_wt),                            \
    5602                 :                            _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),    \
    5603                 :                                             xmm_wb));                           \
    5604                 :         /* calculate horizontal weights */                                      \
    5605                 :         xmm_wh = _mm_add_epi16 (xmm_addc,                                       \
    5606                 :                                 _mm_xor_si128 (xmm_xorc,                        \
    5607                 :                                                _mm_srli_epi16 (xmm_x, 8)));     \
    5608                 :         xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  \
    5609                 :         /* horizontal interpolation */                                          \
    5610                 :         xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   \
    5611                 :         xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                   \
    5612                 :         a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                 \
    5613                 :                            _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                \
    5614                 :         /* shift and pack the result */                                         \
    5615                 :         a = _mm_srli_epi32 (a, 16);                                             \
    5616                 :         a = _mm_packs_epi32 (a, a);                                             \
    5617                 :         a = _mm_packus_epi16 (a, a);                                            \
    5618                 :         pix = _mm_cvtsi128_si32 (a);                                            \
    5619                 :     } while (0)
    5620                 : 
    5621            1360 :     while ((width -= 4) >= 0)
    5622                 :     {
    5623           12032 :         INTERPOLATE_ONE_PIXEL (pix1);
    5624           12032 :         INTERPOLATE_ONE_PIXEL (pix2);
    5625           12032 :         INTERPOLATE_ONE_PIXEL (pix3);
    5626           12032 :         INTERPOLATE_ONE_PIXEL (pix4);
    5627             752 :         *out++ = pix1;
    5628             752 :         *out++ = pix2;
    5629             752 :         *out++ = pix3;
    5630             752 :         *out++ = pix4;
    5631                 :     }
    5632             304 :     if (width & 2)
    5633                 :     {
    5634            1792 :         INTERPOLATE_ONE_PIXEL (pix1);
    5635            1792 :         INTERPOLATE_ONE_PIXEL (pix2);
    5636             112 :         *out++ = pix1;
    5637             112 :         *out++ = pix2;
    5638                 :     }
    5639             304 :     if (width & 1)
    5640                 :     {
    5641            1536 :         INTERPOLATE_ONE_PIXEL (pix1);
    5642              96 :         *out = pix1;
    5643                 :     }
    5644                 : 
    5645                 :     #undef INTERPOLATE_ONE_PIXEL
    5646             304 : }
    5647                 : /* Scanline callback given to FAST_BILINEAR_MAINLOOP_COMMON below: forwards one scanline to bilinear_interpolate_line_sse2 (). */
    5648                 : static force_inline void
    5649                 : scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
    5650                 :                                              const uint32_t * mask, /* not used by this function */
    5651                 :                                              const uint32_t * src_top,
    5652                 :                                              const uint32_t * src_bottom,
    5653                 :                                              int32_t          w, /* forwarded as the last argument */
    5654                 :                                              int              wt,
    5655                 :                                              int              wb,
    5656                 :                                              pixman_fixed_t   vx,
    5657                 :                                              pixman_fixed_t   unit_x,
    5658                 :                                              pixman_fixed_t   max_vx, /* not used by this function */
    5659                 :                                              pixman_bool_t    zero_src) /* not used by this function */
    5660                 : {
    5661             304 :     bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
    5662                 :                                     wt, wb, vx, unit_x, w);
    5663                 : }
    5664                 : /* Three instantiations (COVER, PAD, NONE) that all use the scanline function above. NOTE(review): the meaning of the type arguments and the two FALSE flags is set by the macro definition, which is not visible here - confirm there. */
    5665               6 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
    5666                 :                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
    5667                 :                                uint32_t, uint32_t, uint32_t,
    5668             114 :                                COVER, FALSE, FALSE)
    5669               0 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
    5670                 :                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
    5671                 :                                uint32_t, uint32_t, uint32_t,
    5672               0 :                                PAD, FALSE, FALSE)
    5673             300 : FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
    5674                 :                                scaled_bilinear_scanline_sse2_8888_8888_SRC,
    5675                 :                                uint32_t, uint32_t, uint32_t,
    5676               4 :                                NONE, FALSE, FALSE)
    5677                 : /* Table of (operator, source, mask, destination, handler) entries; it is passed to _pixman_implementation_create () in _pixman_implementation_create_sse2 (). */
    5678                 : static const pixman_fast_path_t sse2_fast_paths[] =
    5679                 : {
    5680                 :     /* PIXMAN_OP_OVER */
    5681                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    5682                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    5683                 :     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    5684                 :     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    5685                 :     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    5686                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    5687                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    5688                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    5689                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    5690                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    5691                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    5692                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    5693                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    5694                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    5695                 :     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    5696                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    5697                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    5698                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    5699                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    5700                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    5701                 :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    5702                 :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    5703                 :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    5704                 :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    5705                 :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    5706                 :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    5707                 :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    5708                 :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    5709                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    5710                 :     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    5711                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    5712                 :     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    5713                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    5714                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    5715                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    5716                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    5717                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    5718                 :     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    5719                 :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    5720                 :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    5721                 :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    5722                 :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    5723                 :     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    5724                 :     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    5725                 :     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), /* same handler as the SRC copies below */
    5726                 :     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    5727                 :     
    5728                 :     /* PIXMAN_OP_OVER_REVERSE */
    5729                 :     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    5730                 :     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
    5731                 : 
    5732                 :     /* PIXMAN_OP_ADD */
    5733                 :     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    5734                 :     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    5735                 :     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    5736                 :     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    5737                 :     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    5738                 :     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    5739                 : 
    5740                 :     /* PIXMAN_OP_SRC */
    5741                 :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    5742                 :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    5743                 :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    5744                 :     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    5745                 :     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    5746                 :     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    5747                 :     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    5748                 :     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    5749                 :     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    5750                 :     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    5751                 :     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    5752                 :     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    5753                 :     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    5754                 :     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
    5755                 : 
    5756                 :     /* PIXMAN_OP_IN */
    5757                 :     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    5758                 :     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    5759                 :     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
    5760                 : /* SIMPLE_NEAREST_* entries: OVER in COVER, NONE and PAD variants */
    5761                 :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    5762                 :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    5763                 :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    5764                 :     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    5765                 :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    5766                 :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    5767                 :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    5768                 :     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    5769                 :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    5770                 :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    5771                 :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    5772                 :     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    5773                 : /* SIMPLE_NEAREST_SOLID_MASK_* entries: OVER with the sse2_8888_n_8888 handlers */
    5774                 :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    5775                 :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    5776                 :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    5777                 :     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    5778                 : /* SIMPLE_BILINEAR_* entries: SRC only. NOTE(review): presumably these resolve to the sse2_8888_8888_{cover,pad,none}_SRC functions instantiated above - confirm in the macro. */
    5779                 :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    5780                 :     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    5781                 :     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    5782                 : /* last entry; presumably the end-of-table sentinel - confirm against the lookup code */
    5783                 :     { PIXMAN_OP_NONE },
    5784                 : };
    5785                 : /* blt hook (stored in imp->blt by _pixman_implementation_create_sse2): try pixman_blt_sse2 () and, if it returns false, pass the identical request to imp->delegate. */
    5786                 : static pixman_bool_t
    5787               0 : sse2_blt (pixman_implementation_t *imp,
    5788                 :           uint32_t *               src_bits,
    5789                 :           uint32_t *               dst_bits,
    5790                 :           int                      src_stride,
    5791                 :           int                      dst_stride,
    5792                 :           int                      src_bpp,
    5793                 :           int                      dst_bpp,
    5794                 :           int                      src_x,
    5795                 :           int                      src_y,
    5796                 :           int                      dst_x,
    5797                 :           int                      dst_y,
    5798                 :           int                      width,
    5799                 :           int                      height)
    5800                 : {
    5801               0 :     if (!pixman_blt_sse2 (
    5802                 :             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
    5803                 :             src_x, src_y, dst_x, dst_y, width, height))
    5804                 : 
    5805                 :     {
    5806               0 :         return _pixman_implementation_blt ( /* the result is whatever the delegate reports */
    5807                 :             imp->delegate,
    5808                 :             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
    5809                 :             src_x, src_y, dst_x, dst_y, width, height);
    5810                 :     }
    5811                 : /* pixman_blt_sse2 () returned true */
    5812               0 :     return TRUE;
    5813                 : }
    5814                 : /* fill hook (stored in imp->fill by _pixman_implementation_create_sse2): try pixman_fill_sse2 () and, if it returns false, pass the identical request to imp->delegate. */
    5815                 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) /* GCC-compatible, non-x86-64 builds only */
    5816                 : __attribute__((__force_align_arg_pointer__)) /* NOTE(review): presumably so SSE2 code below sees an aligned stack when the caller does not guarantee one - confirm */
    5817                 : #endif
    5818                 : static pixman_bool_t
    5819              17 : sse2_fill (pixman_implementation_t *imp,
    5820                 :            uint32_t *               bits,
    5821                 :            int                      stride,
    5822                 :            int                      bpp,
    5823                 :            int                      x,
    5824                 :            int                      y,
    5825                 :            int                      width,
    5826                 :            int                      height,
    5827                 :            uint32_t xor)
    5828                 : {
    5829              17 :     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    5830                 :     {
    5831               0 :         return _pixman_implementation_fill ( /* the result is whatever the delegate reports */
    5832                 :             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    5833                 :     }
    5834                 : /* pixman_fill_sse2 () returned true */
    5835              17 :     return TRUE;
    5836                 : }
    5837                 : /* Scanline fetcher registered for PIXMAN_x8r8g8b8 in fetchers[]: copies iter->width pixels from iter->bits to iter->buffer with the top byte forced to 0xff, then returns iter->buffer. The mask argument is not used. */
    5838                 : static uint32_t *
    5839               0 : sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
    5840                 : {
    5841               0 :     int w = iter->width;
    5842               0 :     __m128i ff000000 = mask_ff000000;
    5843               0 :     uint32_t *dst = iter->buffer;
    5844               0 :     uint32_t *src = (uint32_t *)iter->bits;
    5845                 : /* advance the iterator by one stride so the next call reads the following row */
    5846               0 :     iter->bits += iter->stride;
    5847                 : /* scalar head: one pixel at a time until dst is 16-byte aligned */
    5848               0 :     while (w && ((unsigned long)dst) & 0x0f)
    5849                 :     {
    5850               0 :         *dst++ = (*src++) | 0xff000000;
    5851               0 :         w--;
    5852                 :     }
    5853                 : /* vector body: 4 pixels per iteration; src is loaded with the unaligned helper because only dst was aligned above */
    5854               0 :     while (w >= 4)
    5855                 :     {
    5856               0 :         save_128_aligned (
    5857                 :             (__m128i *)dst, _mm_or_si128 (
    5858                 :                 load_128_unaligned ((__m128i *)src), ff000000));
    5859                 : 
    5860               0 :         dst += 4;
    5861               0 :         src += 4;
    5862               0 :         w -= 4;
    5863                 :     }
    5864                 : /* scalar tail: the remaining 0-3 pixels */
    5865               0 :     while (w)
    5866                 :     {
    5867               0 :         *dst++ = (*src++) | 0xff000000;
    5868               0 :         w--;
    5869                 :     }
    5870                 : 
    5871               0 :     return iter->buffer;
    5872                 : }
    5873                 : /* Scanline fetcher registered for PIXMAN_r5g6b5 in fetchers[]: converts iter->width 16-bit pixels from iter->bits into 32-bit pixels in iter->buffer, then returns iter->buffer. The mask argument is not used. */
    5874                 : static uint32_t *
    5875               0 : sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
    5876                 : {
    5877               0 :     int w = iter->width;
    5878               0 :     uint32_t *dst = iter->buffer;
    5879               0 :     uint16_t *src = (uint16_t *)iter->bits;
    5880               0 :     __m128i ff000000 = mask_ff000000;
    5881                 : /* advance the iterator by one stride so the next call reads the following row */
    5882               0 :     iter->bits += iter->stride;
    5883                 : /* scalar head: one pixel at a time until dst is 16-byte aligned */
    5884               0 :     while (w && ((unsigned long)dst) & 0x0f)
    5885                 :     {
    5886               0 :         uint16_t s = *src++;
    5887                 : 
    5888               0 :         *dst++ = CONVERT_0565_TO_8888 (s);
    5889               0 :         w--;
    5890                 :     }
    5891                 : /* vector body: 8 source pixels (one 128-bit load) become two aligned 4-pixel stores */
    5892               0 :     while (w >= 8)
    5893                 :     {
    5894                 :         __m128i lo, hi, s;
    5895                 : 
    5896               0 :         s = _mm_loadu_si128 ((__m128i *)src);
    5897                 : /* zero-extend each 16-bit pixel to a 32-bit lane before expanding the channels */
    5898               0 :         lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
    5899               0 :         hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
    5900                 : /* force the top byte of every output pixel to 0xff */
    5901               0 :         save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
    5902               0 :         save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
    5903                 : 
    5904               0 :         dst += 8;
    5905               0 :         src += 8;
    5906               0 :         w -= 8;
    5907                 :     }
    5908                 : /* scalar tail: the remaining 0-7 pixels */
    5909               0 :     while (w)
    5910                 :     {
    5911               0 :         uint16_t s = *src++;
    5912                 : 
    5913               0 :         *dst++ = CONVERT_0565_TO_8888 (s);
    5914               0 :         w--;
    5915                 :     }
    5916                 : 
    5917               0 :     return iter->buffer;
    5918                 : }
    5919                 : /* Scanline fetcher registered for PIXMAN_a8 in fetchers[]: expands iter->width 8-bit values from iter->bits into 32-bit pixels in iter->buffer, each value placed in bits 31..24 with the low 24 bits zero, then returns iter->buffer. The mask argument is not used. */
    5920                 : static uint32_t *
    5921               0 : sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
    5922                 : {
    5923               0 :     int w = iter->width;
    5924               0 :     uint32_t *dst = iter->buffer;
    5925               0 :     uint8_t *src = iter->bits;
    5926                 :     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
    5927                 : /* advance the iterator by one stride so the next call reads the following row */
    5928               0 :     iter->bits += iter->stride;
    5929                 : /* scalar head until dst is 16-byte aligned; the cast keeps the shift unsigned (a uint8_t promotes to int, and shifting a value >= 0x80 left by 24 overflows int) */
    5930               0 :     while (w && (((unsigned long)dst) & 15))
    5931                 :     {
    5932               0 :         *dst++ = ((uint32_t) *(src++)) << 24;
    5933               0 :         w--;
    5934                 :     }
    5935                 : /* vector body: 16 source bytes per iteration, written as four aligned 4-pixel stores */
    5936               0 :     while (w >= 16)
    5937                 :     {
    5938               0 :         xmm0 = _mm_loadu_si128((__m128i *)src);
    5939                 : /* interleaving with zero as the FIRST operand puts each source byte in the high half: byte -> high byte of a 16-bit lane -> high byte of a 32-bit lane */
    5940               0 :         xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
    5941               0 :         xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
    5942               0 :         xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
    5943               0 :         xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
    5944               0 :         xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
    5945               0 :         xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
    5946                 : /* xmm3..xmm6 hold source bytes 0-3, 4-7, 8-11 and 12-15 respectively */
    5947               0 :         _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
    5948               0 :         _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
    5949               0 :         _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
    5950               0 :         _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
    5951                 : 
    5952               0 :         dst += 16;
    5953               0 :         src += 16;
    5954               0 :         w -= 16;
    5955                 :     }
    5956                 : /* scalar tail: the remaining 0-15 pixels, with the same unsigned shift as the head loop */
    5957               0 :     while (w)
    5958                 :     {
    5959               0 :         *dst++ = ((uint32_t) *(src++)) << 24;
    5960               0 :         w--;
    5961                 :     }
    5962                 : 
    5963               0 :     return iter->buffer;
    5964                 : }
    5965                 : /* One row of the fetchers[] lookup table: an image format and the scanline function used for it. */
    5966                 : typedef struct
    5967                 : {
    5968                 :     pixman_format_code_t        format; /* compared with image->common.extended_format_code in sse2_src_iter_init () */
    5969                 :     pixman_iter_get_scanline_t  get_scanline; /* stored in iter->get_scanline when the format matches */
    5970                 : } fetcher_info_t;
    5971                 : /* Formats that have an SSE2 scanline fetcher; sse2_src_iter_init () scans this table until it reaches the PIXMAN_null entry. */
    5972                 : static const fetcher_info_t fetchers[] =
    5973                 : {
    5974                 :     { PIXMAN_x8r8g8b8,          sse2_fetch_x8r8g8b8 },
    5975                 :     { PIXMAN_r5g6b5,            sse2_fetch_r5g6b5 },
    5976                 :     { PIXMAN_a8,                sse2_fetch_a8 },
    5977                 :     { PIXMAN_null } /* terminator: get_scanline is left zero-initialised */
    5978                 : };
    5979                 : /* src_iter_init hook: when the iterator is narrow, the image has all FLAGS set, the requested rectangle lies inside the image and the format is listed in fetchers[], point the iterator at the image bits and install the SSE2 fetcher; otherwise defer to imp->delegate. */
    5980                 : static void
    5981               0 : sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
    5982                 : {
    5983               0 :     pixman_image_t *image = iter->image;
    5984               0 :     int x = iter->x;
    5985               0 :     int y = iter->y;
    5986               0 :     int width = iter->width;
    5987               0 :     int height = iter->height;
    5988                 : /* NOTE(review): FLAGS is never #undef'd, so it stays defined for the remainder of this file */
    5989                 : #define FLAGS                                                           \
    5990                 :     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
    5991                 : /* every condition must hold; the last four keep the rectangle (x, y, width, height) within image->bits */
    5992               0 :     if ((iter->flags & ITER_NARROW)                              &&
    5993               0 :         (image->common.flags & FLAGS) == FLAGS                   &&
    5994               0 :         x >= 0 && y >= 0                                  &&
    5995               0 :         x + width <= image->bits.width                            &&
    5996               0 :         y + height <= image->bits.height)
    5997                 :     {
    5998                 :         const fetcher_info_t *f;
    5999                 : /* linear search; the PIXMAN_null entry ends the table */
    6000               0 :         for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
    6001                 :         {
    6002               0 :             if (image->common.extended_format_code == f->format)
    6003                 :             {
    6004               0 :                 uint8_t *b = (uint8_t *)image->bits.bits;
    6005               0 :                 int s = image->bits.rowstride * 4; /* presumably rowstride counts uint32_t units, making s a byte stride - confirm */
    6006                 : /* byte address of the first requested pixel: row iter->y (same value as y above), column x */
    6007               0 :                 iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
    6008               0 :                 iter->stride = s;
    6009                 : 
    6010               0 :                 iter->get_scanline = f->get_scanline;
    6011               0 :                 return;
    6012                 :             }
    6013                 :         }
    6014                 :     }
    6015                 : /* no SSE2 fetcher applies: let the delegate initialise the iterator */
    6016               0 :     imp->delegate->src_iter_init (imp->delegate, iter);
    6017                 : }
    6018                 : /* Creates the SSE2 implementation on top of `fallback`: registers sse2_fast_paths, assigns the file-scope mask constants, installs the SSE2 combiners and the blt / fill / src_iter_init hooks, and returns the new implementation. */
    6019                 : #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) /* GCC-compatible, non-x86-64 builds only */
    6020                 : __attribute__((__force_align_arg_pointer__))
    6021                 : #endif
    6022                 : pixman_implementation_t *
    6023               4 : _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
    6024                 : {
    6025               4 :     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
    6026                 : /* the masks are assigned here at run time; code that reads them must only run after this function has been called */
    6027                 :     /* SSE2 constants */
    6028               4 :     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    6029               4 :     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    6030               4 :     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    6031               4 :     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    6032               4 :     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000); /* same value as mask_565_r */
    6033               4 :     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    6034               4 :     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    6035               4 :     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    6036               4 :     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    6037               4 :     mask_0080 = create_mask_16_128 (0x0080);
    6038               4 :     mask_00ff = create_mask_16_128 (0x00ff);
    6039               4 :     mask_0101 = create_mask_16_128 (0x0101);
    6040               4 :     mask_ffff = create_mask_16_128 (0xffff);
    6041               4 :     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    6042               4 :     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); /* the only mask built from two different words */
    6043                 : 
    6044                 :     /* Set up function pointers */
    6045               4 :     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    6046               4 :     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    6047               4 :     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    6048               4 :     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    6049               4 :     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    6050               4 :     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    6051               4 :     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    6052               4 :     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    6053               4 :     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    6054               4 :     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
    6055                 : 
    6056               4 :     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; /* SATURATE gets no combine_32_ca entry below */
    6057                 : /* combine_32_ca entries (the *_ca functions); unlike combine_32 above, PIXMAN_OP_SRC is set here */
    6058               4 :     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    6059               4 :     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    6060               4 :     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    6061               4 :     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    6062               4 :     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    6063               4 :     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    6064               4 :     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    6065               4 :     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    6066               4 :     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    6067               4 :     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    6068               4 :     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
    6069                 : /* hooks that try the SSE2 routine first and fall back to imp->delegate */
    6070               4 :     imp->blt = sse2_blt;
    6071               4 :     imp->fill = sse2_fill;
    6072                 : 
    6073               4 :     imp->src_iter_init = sse2_src_iter_init;
    6074                 : 
    6075               4 :     return imp;
    6076                 : }

Generated by: LCOV version 1.7