LCOV - code coverage report
Current view: directory - gfx/skia/src/opts - SkBlitRow_opts_SSE2.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 242 0 0.0 %
Date: 2012-06-02 Functions: 5 0 0.0 %

       1                 : 
       2                 : /*
       3                 :  * Copyright 2009 The Android Open Source Project
       4                 :  *
       5                 :  * Use of this source code is governed by a BSD-style license that can be
       6                 :  * found in the LICENSE file.
       7                 :  */
       8                 : 
       9                 : 
      10                 : #include "SkBlitRow_opts_SSE2.h"
      11                 : #include "SkColorPriv.h"
      12                 : #include "SkUtils.h"
      13                 : 
      14                 : #include <emmintrin.h>
      15                 : 
      16                 : /* SSE2 version of S32_Blend_BlitRow32()
      17                 :  * portable version is in core/SkBlitRow_D32.cpp
      18                 :  */
      19               0 : void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
      20                 :                               const SkPMColor* SK_RESTRICT src,
      21                 :                               int count, U8CPU alpha) {
      22               0 :     SkASSERT(alpha <= 255);
      23               0 :     if (count <= 0) {
      24               0 :         return;
      25                 :     }
      26                 : 
      27               0 :     uint32_t src_scale = SkAlpha255To256(alpha);
      28               0 :     uint32_t dst_scale = 256 - src_scale;
      29                 : 
      30               0 :     if (count >= 4) {
      31               0 :         SkASSERT(((size_t)dst & 0x03) == 0);
      32               0 :         while (((size_t)dst & 0x0F) != 0) {
      33               0 :             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
      34               0 :             src++;
      35               0 :             dst++;
      36               0 :             count--;
      37                 :         }
      38                 : 
      39               0 :         const __m128i *s = reinterpret_cast<const __m128i*>(src);
      40               0 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
      41               0 :         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
      42               0 :         __m128i src_scale_wide = _mm_set1_epi16(src_scale);
      43               0 :         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
      44               0 :         while (count >= 4) {
      45                 :             // Load 4 pixels each of src and dest.
      46               0 :             __m128i src_pixel = _mm_loadu_si128(s);
      47               0 :             __m128i dst_pixel = _mm_load_si128(d);
      48                 : 
      49                 :             // Get red and blue pixels into lower byte of each word.
      50               0 :             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
      51               0 :             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
      52                 : 
      53                 :             // Get alpha and green into lower byte of each word.
      54               0 :             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
      55               0 :             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
      56                 : 
      57                 :             // Multiply by scale.
      58               0 :             src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
      59               0 :             src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
      60               0 :             dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
      61               0 :             dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
      62                 : 
      63                 :             // Divide by 256.
      64               0 :             src_rb = _mm_srli_epi16(src_rb, 8);
      65               0 :             dst_rb = _mm_srli_epi16(dst_rb, 8);
      66               0 :             src_ag = _mm_andnot_si128(rb_mask, src_ag);
      67               0 :             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
      68                 : 
      69                 :             // Combine back into RGBA.
      70               0 :             src_pixel = _mm_or_si128(src_rb, src_ag);
      71               0 :             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
      72                 : 
      73                 :             // Add result
      74               0 :             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
      75               0 :             _mm_store_si128(d, result);
      76               0 :             s++;
      77               0 :             d++;
      78               0 :             count -= 4;
      79                 :         }
      80               0 :         src = reinterpret_cast<const SkPMColor*>(s);
      81               0 :         dst = reinterpret_cast<SkPMColor*>(d);
      82                 :     }
      83                 : 
      84               0 :     while (count > 0) {
      85               0 :         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
      86               0 :         src++;
      87               0 :         dst++;
      88               0 :         count--;
      89                 :     }
      90                 : }
      91                 : 
      92               0 : void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
      93                 :                                 const SkPMColor* SK_RESTRICT src,
      94                 :                                 int count, U8CPU alpha) {
      95               0 :     SkASSERT(alpha == 255);
      96               0 :     if (count <= 0) {
      97               0 :         return;
      98                 :     }
      99                 : 
     100               0 :     if (count >= 4) {
     101               0 :         SkASSERT(((size_t)dst & 0x03) == 0);
     102               0 :         while (((size_t)dst & 0x0F) != 0) {
     103               0 :             *dst = SkPMSrcOver(*src, *dst);
     104               0 :             src++;
     105               0 :             dst++;
     106               0 :             count--;
     107                 :         }
     108                 : 
     109               0 :         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     110               0 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
     111                 : #ifdef SK_USE_ACCURATE_BLENDING
     112                 :         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     113                 :         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
     114                 :         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
     115                 :         while (count >= 4) {
     116                 :             // Load 4 pixels
     117                 :             __m128i src_pixel = _mm_loadu_si128(s);
     118                 :             __m128i dst_pixel = _mm_load_si128(d);
     119                 : 
     120                 :             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     121                 :             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     122                 :             // Shift alphas down to lower 8 bits of each quad.
     123                 :             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
     124                 : 
     125                 :             // Copy alpha to upper 3rd byte of each quad
     126                 :             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
     127                 : 
     128                 :             // Subtract alphas from 255, to get 0..255
     129                 :             alpha = _mm_sub_epi16(c_255, alpha);
     130                 : 
     131                 :             // Multiply by red and blue by src alpha.
     132                 :             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
     133                 :             // Multiply by alpha and green by src alpha.
     134                 :             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
     135                 : 
     136                 :             // dst_rb_low = (dst_rb >> 8)
     137                 :             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
     138                 :             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
     139                 : 
     140                 :             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
     141                 :             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
     142                 :             dst_rb = _mm_add_epi16(dst_rb, c_128);
     143                 :             dst_rb = _mm_srli_epi16(dst_rb, 8);
     144                 : 
     145                 :             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
     146                 :             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
     147                 :             dst_ag = _mm_add_epi16(dst_ag, c_128);
     148                 :             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
     149                 : 
     150                 :             // Combine back into RGBA.
     151                 :             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     152                 : 
     153                 :             // Add result
     154                 :             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     155                 :             _mm_store_si128(d, result);
     156                 :             s++;
     157                 :             d++;
     158                 :             count -= 4;
     159                 :         }
     160                 :     #else
     161               0 :         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     162               0 :         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
     163               0 :         while (count >= 4) {
     164                 :             // Load 4 pixels
     165               0 :             __m128i src_pixel = _mm_loadu_si128(s);
     166               0 :             __m128i dst_pixel = _mm_load_si128(d);
     167                 : 
     168               0 :             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     169               0 :             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     170                 : 
     171                 :             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
     172               0 :             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
     173                 : 
     174                 :             // (a0, a0, a1, a1, a2, g2, a3, g3)
     175               0 :             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
     176                 : 
     177                 :             // (a0, a0, a1, a1, a2, a2, a3, a3)
     178               0 :             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
     179                 : 
     180                 :             // Subtract alphas from 256, to get 1..256
     181               0 :             alpha = _mm_sub_epi16(c_256, alpha);
     182                 : 
     183                 :             // Multiply by red and blue by src alpha.
     184               0 :             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
     185                 :             // Multiply by alpha and green by src alpha.
     186               0 :             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
     187                 : 
     188                 :             // Divide by 256.
     189               0 :             dst_rb = _mm_srli_epi16(dst_rb, 8);
     190                 : 
     191                 :             // Mask out high bits (already in the right place)
     192               0 :             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
     193                 : 
     194                 :             // Combine back into RGBA.
     195               0 :             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     196                 : 
     197                 :             // Add result
     198               0 :             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     199               0 :             _mm_store_si128(d, result);
     200               0 :             s++;
     201               0 :             d++;
     202               0 :             count -= 4;
     203                 :         }
     204                 : #endif
     205               0 :         src = reinterpret_cast<const SkPMColor*>(s);
     206               0 :         dst = reinterpret_cast<SkPMColor*>(d);
     207                 :     }
     208                 : 
     209               0 :     while (count > 0) {
     210               0 :         *dst = SkPMSrcOver(*src, *dst);
     211               0 :         src++;
     212               0 :         dst++;
     213               0 :         count--;
     214                 :     }
     215                 : }
     216                 : 
     217               0 : void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     218                 :                                const SkPMColor* SK_RESTRICT src,
     219                 :                                int count, U8CPU alpha) {
     220               0 :     SkASSERT(alpha <= 255);
     221               0 :     if (count <= 0) {
     222               0 :         return;
     223                 :     }
     224                 : 
     225               0 :     if (count >= 4) {
     226               0 :         while (((size_t)dst & 0x0F) != 0) {
     227               0 :             *dst = SkBlendARGB32(*src, *dst, alpha);
     228               0 :             src++;
     229               0 :             dst++;
     230               0 :             count--;
     231                 :         }
     232                 : 
     233               0 :         uint32_t src_scale = SkAlpha255To256(alpha);
     234                 : 
     235               0 :         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     236               0 :         __m128i *d = reinterpret_cast<__m128i*>(dst);
     237               0 :         __m128i src_scale_wide = _mm_set1_epi16(src_scale);
     238               0 :         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     239               0 :         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
     240               0 :         while (count >= 4) {
     241                 :             // Load 4 pixels each of src and dest.
     242               0 :             __m128i src_pixel = _mm_loadu_si128(s);
     243               0 :             __m128i dst_pixel = _mm_load_si128(d);
     244                 : 
     245                 :             // Get red and blue pixels into lower byte of each word.
     246               0 :             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     247               0 :             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     248                 : 
     249                 :             // Get alpha and green into lower byte of each word.
     250               0 :             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     251               0 :             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
     252                 : 
     253                 :             // Put per-pixel alpha in low byte of each word.
     254               0 :             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
     255               0 :             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
     256                 : 
     257                 :             // dst_alpha = dst_alpha * src_scale
     258               0 :             dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
     259                 : 
     260                 :             // Divide by 256.
     261               0 :             dst_alpha = _mm_srli_epi16(dst_alpha, 8);
     262                 : 
     263                 :             // Subtract alphas from 256, to get 1..256
     264               0 :             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
     265                 : 
     266                 :             // Multiply red and blue by dst pixel alpha.
     267               0 :             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
     268                 :             // Multiply alpha and green by dst pixel alpha.
     269               0 :             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
     270                 : 
     271                 :             // Multiply red and blue by global alpha.
     272               0 :             src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
     273                 :             // Multiply alpha and green by global alpha.
     274               0 :             src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
     275                 : 
     276                 :             // Divide by 256.
     277               0 :             dst_rb = _mm_srli_epi16(dst_rb, 8);
     278               0 :             src_rb = _mm_srli_epi16(src_rb, 8);
     279                 : 
     280                 :             // Mask out low bits (goodies already in the right place; no need to divide)
     281               0 :             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
     282               0 :             src_ag = _mm_andnot_si128(rb_mask, src_ag);
     283                 : 
     284                 :             // Combine back into RGBA.
     285               0 :             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     286               0 :             src_pixel = _mm_or_si128(src_rb, src_ag);
     287                 : 
     288                 :             // Add two pixels into result.
     289               0 :             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     290               0 :             _mm_store_si128(d, result);
     291               0 :             s++;
     292               0 :             d++;
     293               0 :             count -= 4;
     294                 :         }
     295               0 :         src = reinterpret_cast<const SkPMColor*>(s);
     296               0 :         dst = reinterpret_cast<SkPMColor*>(d);
     297                 :     }
     298                 : 
     299               0 :     while (count > 0) {
     300               0 :         *dst = SkBlendARGB32(*src, *dst, alpha);
     301               0 :         src++;
     302               0 :         dst++;
     303               0 :         count--;
     304                 :     }
     305                 : }
     306                 : 
     307                 : /* SSE2 version of Color32()
     308                 :  * portable version is in core/SkBlitRow_D32.cpp
     309                 :  */
     310               0 : void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
     311                 :                   SkPMColor color) {
     312                 : 
     313               0 :     if (count <= 0) {
     314               0 :         return;
     315                 :     }
     316                 : 
     317               0 :     if (0 == color) {
     318               0 :         if (src != dst) {
     319               0 :             memcpy(dst, src, count * sizeof(SkPMColor));
     320                 :         }
     321               0 :         return;
     322                 :     }
     323                 : 
     324               0 :     unsigned colorA = SkGetPackedA32(color);
     325               0 :     if (255 == colorA) {
     326               0 :         sk_memset32(dst, color, count);
     327                 :     } else {
     328               0 :         unsigned scale = 256 - SkAlpha255To256(colorA);
     329                 : 
     330               0 :         if (count >= 4) {
     331               0 :             SkASSERT(((size_t)dst & 0x03) == 0);
     332               0 :             while (((size_t)dst & 0x0F) != 0) {
     333               0 :                 *dst = color + SkAlphaMulQ(*src, scale);
     334               0 :                 src++;
     335               0 :                 dst++;
     336               0 :                 count--;
     337                 :             }
     338                 : 
     339               0 :             const __m128i *s = reinterpret_cast<const __m128i*>(src);
     340               0 :             __m128i *d = reinterpret_cast<__m128i*>(dst);
     341               0 :             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     342               0 :             __m128i src_scale_wide = _mm_set1_epi16(scale);
     343               0 :             __m128i color_wide = _mm_set1_epi32(color);
     344               0 :             while (count >= 4) {
     345                 :                 // Load 4 pixels each of src and dest.
     346               0 :                 __m128i src_pixel = _mm_loadu_si128(s);
     347                 : 
     348                 :                 // Get red and blue pixels into lower byte of each word.
     349               0 :                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     350                 : 
     351                 :                 // Get alpha and green into lower byte of each word.
     352               0 :                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
     353                 : 
     354                 :                 // Multiply by scale.
     355               0 :                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
     356               0 :                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
     357                 : 
     358                 :                 // Divide by 256.
     359               0 :                 src_rb = _mm_srli_epi16(src_rb, 8);
     360               0 :                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
     361                 : 
     362                 :                 // Combine back into RGBA.
     363               0 :                 src_pixel = _mm_or_si128(src_rb, src_ag);
     364                 : 
     365                 :                 // Add color to result.
     366               0 :                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
     367                 : 
     368                 :                 // Store result.
     369               0 :                 _mm_store_si128(d, result);
     370               0 :                 s++;
     371               0 :                 d++;
     372               0 :                 count -= 4;
     373                 :             }
     374               0 :             src = reinterpret_cast<const SkPMColor*>(s);
     375               0 :             dst = reinterpret_cast<SkPMColor*>(d);
     376                 :          }
     377                 : 
     378               0 :         while (count > 0) {
     379               0 :             *dst = color + SkAlphaMulQ(*src, scale);
     380               0 :             src += 1;
     381               0 :             dst += 1;
     382               0 :             count--;
     383                 :         }
     384                 :     }
     385                 : }
     386                 : 
     387               0 : void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
     388                 :                                size_t maskRB, SkColor origColor,
     389                 :                                int width, int height)
     390                 : {
     391               0 :     SkPMColor color = SkPreMultiplyColor(origColor);
     392               0 :     size_t dstOffset = dstRB - (width << 2);
     393               0 :     size_t maskOffset = maskRB - width;
     394               0 :     SkPMColor* dst = (SkPMColor *)device;
     395               0 :     const uint8_t* mask = (const uint8_t*)maskPtr;
     396               0 :     do {
     397               0 :         int count = width;
     398               0 :         if (count >= 4) {
     399               0 :             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
     400               0 :                 *dst = SkBlendARGB32(color, *dst, *mask);
     401               0 :                 mask++;
     402               0 :                 dst++;
     403               0 :                 count--;
     404                 :             }
     405               0 :             __m128i *d = reinterpret_cast<__m128i*>(dst);
     406               0 :             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     407               0 :             __m128i c_256 = _mm_set1_epi16(256);
     408               0 :             __m128i c_1 = _mm_set1_epi16(1);
     409               0 :             __m128i src_pixel = _mm_set1_epi32(color);
     410               0 :             while (count >= 4) {
     411                 :                 // Load 4 pixels each of src and dest.
     412               0 :                 __m128i dst_pixel = _mm_load_si128(d);
     413                 : 
     414                 :                 //set the aphla value
     415               0 :                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
     416               0 :                                 0, *(mask+3),0, \
     417               0 :                                 *(mask+2),0, *(mask+2),\
     418               0 :                                 0,*(mask+1), 0,*(mask+1),\
     419               0 :                                 0, *mask,0,*mask);
     420                 : 
     421                 :                 //call SkAlpha255To256()
     422               0 :                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
     423                 : 
     424                 :                 // Get red and blue pixels into lower byte of each word.
     425               0 :                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     426               0 :                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     427                 : 
     428                 :                 // Get alpha and green into lower byte of each word.
     429               0 :                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     430               0 :                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
     431                 : 
     432                 :                 // Put per-pixel alpha in low byte of each word.
     433               0 :                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
     434               0 :                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
     435                 : 
     436                 :                 // dst_alpha = dst_alpha * src_scale
     437               0 :                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
     438                 : 
     439                 :                 // Divide by 256.
     440               0 :                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
     441                 : 
     442                 :                 // Subtract alphas from 256, to get 1..256
     443               0 :                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
     444                 :                 // Multiply red and blue by dst pixel alpha.
     445               0 :                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
     446                 :                 // Multiply alpha and green by dst pixel alpha.
     447               0 :                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
     448                 : 
     449                 :                 // Multiply red and blue by global alpha.
     450               0 :                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
     451                 :                 // Multiply alpha and green by global alpha.
     452               0 :                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
     453                 :                 // Divide by 256.
     454               0 :                 dst_rb = _mm_srli_epi16(dst_rb, 8);
     455               0 :                 src_rb = _mm_srli_epi16(src_rb, 8);
     456                 : 
     457                 :                 // Mask out low bits (goodies already in the right place; no need to divide)
     458               0 :                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
     459               0 :                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
     460                 : 
     461                 :                 // Combine back into RGBA.
     462               0 :                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     463               0 :                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
     464                 : 
     465                 :                 // Add two pixels into result.
     466               0 :                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
     467               0 :                 _mm_store_si128(d, result);
     468                 :                 // load the next 4 pixel
     469               0 :                 mask = mask + 4;
     470               0 :                 d++;
     471               0 :                 count -= 4;
     472                 :             }
     473               0 :             dst = reinterpret_cast<SkPMColor *>(d);
     474                 :         }
     475               0 :         while(count > 0) {
     476               0 :             *dst= SkBlendARGB32(color, *dst, *mask);
     477               0 :             dst += 1;
     478               0 :             mask++;
     479               0 :             count --;
     480                 :         }
     481               0 :         dst = (SkPMColor *)((char*)dst + dstOffset);
     482               0 :         mask += maskOffset;
     483                 :     } while (--height != 0);
     484               0 : }

Generated by: LCOV version 1.7