LCOV - code coverage report
Current view: directory - gfx/skia/src/opts - SkBitmapProcState_opts_SSE2.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 97 0 0.0 %
Date: 2012-06-02 Functions: 2 0 0.0 %

       1                 : 
       2                 : /*
       3                 :  * Copyright 2009 The Android Open Source Project
       4                 :  *
       5                 :  * Use of this source code is governed by a BSD-style license that can be
       6                 :  * found in the LICENSE file.
       7                 :  */
       8                 : 
       9                 : 
      10                 : #include <emmintrin.h>
      11                 : #include "SkBitmapProcState_opts_SSE2.h"
      12                 : #include "SkUtils.h"
      13                 : 
      14               0 : void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
      15                 :                                    const uint32_t* xy,
      16                 :                                    int count, uint32_t* colors) {
                               // Bilinearly filters `count` 32-bit pixels from s.fBitmap into `colors`,
                               // with no extra alpha scaling (s.fAlphaScale is asserted to be 256).
                               //   xy[0]   selects the two source rows: bits 31..18 = row0 index,
                               //           bits 17..14 = 4-bit sub-pixel y weight, bits 13..0 = row1 index.
                               //   xy[1..] one word per output pixel: bits 31..18 = x0 column,
                               //           bits 17..14 = 4-bit sub-pixel x weight, bits 13..0 = x1 column.
                               // The loop is a do/while, so at least one pixel is always produced.
                               // Vector comments below list 16-bit lanes from most to least significant.
      17               0 :     SkASSERT(count > 0 && colors != NULL);
      18               0 :     SkASSERT(s.fDoFilter);
      19               0 :     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
      20               0 :     SkASSERT(s.fAlphaScale == 256);
      21                 : 
      22               0 :     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
      23               0 :     unsigned rb = s.fBitmap->rowBytes();
      24               0 :     uint32_t XY = *xy++;
      25               0 :     unsigned y0 = XY >> 14;
      26               0 :     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
      27               0 :     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
      28               0 :     unsigned subY = y0 & 0xF;
      29                 : 
      30                 :     // ( 0,  0,  0,  0,  0,  0,  0, 16)
      31               0 :     __m128i sixteen = _mm_cvtsi32_si128(16);
      32                 : 
      33                 :     // ( 0,  0,  0,  0, 16, 16, 16, 16)
      34               0 :     sixteen = _mm_shufflelo_epi16(sixteen, 0);
      35                 : 
      36                 :     // ( 0,  0,  0,  0,  0,  0,  0,  y)
      37               0 :     __m128i allY = _mm_cvtsi32_si128(subY);
      38                 : 
      39                 :     // ( 0,  0,  0,  0,  y,  y,  y,  y)
      40               0 :     allY = _mm_shufflelo_epi16(allY, 0);
      41                 : 
      42                 :     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
      43               0 :     __m128i negY = _mm_sub_epi16(sixteen, allY);
      44                 : 
      45                 :     // (16-y, 16-y, 16-y, 16-y, y, y, y, y): row0 weight in the high half, row1 weight in the low half.
      46               0 :     allY = _mm_unpacklo_epi64(allY, negY);
      47                 : 
      48                 :     // (16, 16, 16, 16, 16, 16, 16, 16)
      49               0 :     sixteen = _mm_shuffle_epi32(sixteen, 0);
      50                 : 
      51                 :     // ( 0,  0,  0,  0,  0,  0,  0,  0)
      52               0 :     __m128i zero = _mm_setzero_si128();
      53               0 :     do {
      54               0 :         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
      55               0 :         unsigned x0 = XX >> 18;
      56               0 :         unsigned x1 = XX & 0x3FFF;
      57                 : 
      58                 :         // (0, 0, 0, 0, 0, 0, 0, x) where x is the 4-bit sub-pixel x weight.
      59               0 :         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
      60                 :         
      61                 :         // (0, 0, 0, 0, x, x, x, x)
      62               0 :         allX = _mm_shufflelo_epi16(allX, 0);
      63                 : 
      64                 :         // (x, x, x, x, x, x, x, x)
      65               0 :         allX = _mm_shuffle_epi32(allX, 0);
      66                 : 
      67                 :         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
      68               0 :         __m128i negX = _mm_sub_epi16(sixteen, allX);
      69                 : 
      70                 :         // Load 4 samples (pixels), named a<row><column>.
      71               0 :         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
      72               0 :         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
      73               0 :         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
      74               0 :         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
      75                 : 
      76                 :         // (0, 0, a00, a10) as 32-bit lanes.
      77               0 :         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
      78                 : 
      79                 :         // Expand to 16 bits per component.
      80               0 :         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
      81                 : 
      82                 :         // ((a00 * (16-y)), (a10 * y)).
      83               0 :         a00a10 = _mm_mullo_epi16(a00a10, allY);
      84                 : 
      85                 :         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
      86               0 :         a00a10 = _mm_mullo_epi16(a00a10, negX);
      87                 : 
      88                 :         // (0, 0, a01, a11) as 32-bit lanes.
      89               0 :         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
      90                 : 
      91                 :         // Expand to 16 bits per component.
      92               0 :         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
      93                 : 
      94                 :         // (a01 * (16-y)), (a11 * y)
      95               0 :         a01a11 = _mm_mullo_epi16(a01a11, allY);
      96                 : 
      97                 :         // (a01 * (16-y) * x), (a11 * y * x)
      98               0 :         a01a11 = _mm_mullo_epi16(a01a11, allX);
      99                 : 
     100                 :         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
     101               0 :         __m128i sum = _mm_add_epi16(a00a10, a01a11);
     102                 : 
     103                 :         // Copy the high 64 bits into the low 64 bits: (DC, a00*w00 + a01*w01), DC = don't care.
     104               0 :         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
     105                 : 
     106                 :         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
     107               0 :         sum = _mm_add_epi16(sum, shifted);
     108                 : 
     109                 :         // Divide each 16 bit component by 256 (the four weights sum to 16 * 16).
     110               0 :         sum = _mm_srli_epi16(sum, 8);
     111                 : 
     112                 :         // Pack lower 4 16 bit values of sum into lower 4 bytes.
     113               0 :         sum = _mm_packus_epi16(sum, zero);
     114                 : 
     115                 :         // Extract low int and store.
     116               0 :         *colors++ = _mm_cvtsi128_si32(sum);
     117                 :     } while (--count > 0);
     118               0 : }
     119                 : 
     120               0 : void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
     121                 :                                   const uint32_t* xy,
     122                 :                                   int count, uint32_t* colors) {
                               // Bilinearly filters `count` 32-bit pixels from s.fBitmap into `colors`,
                               // then scales every component by s.fAlphaScale / 256 (s.fAlphaScale is
                               // asserted to be below 256).
                               //   xy[0]   selects the two source rows: bits 31..18 = row0 index,
                               //           bits 17..14 = 4-bit sub-pixel y weight, bits 13..0 = row1 index.
                               //   xy[1..] one word per output pixel: bits 31..18 = x0 column,
                               //           bits 17..14 = 4-bit sub-pixel x weight, bits 13..0 = x1 column.
                               // The loop is a do/while, so at least one pixel is always produced.
                               // Vector comments below list 16-bit lanes from most to least significant.
     123               0 :     SkASSERT(count > 0 && colors != NULL);
     124               0 :     SkASSERT(s.fDoFilter);
     125               0 :     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
     126               0 :     SkASSERT(s.fAlphaScale < 256);
     127                 : 
     128               0 :     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
     129               0 :     unsigned rb = s.fBitmap->rowBytes();
     130               0 :     uint32_t XY = *xy++;
     131               0 :     unsigned y0 = XY >> 14;
     132               0 :     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
     133               0 :     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
     134               0 :     unsigned subY = y0 & 0xF;
     135                 : 
     136                 :     // ( 0,  0,  0,  0,  0,  0,  0, 16)
     137               0 :     __m128i sixteen = _mm_cvtsi32_si128(16);
     138                 : 
     139                 :     // ( 0,  0,  0,  0, 16, 16, 16, 16)
     140               0 :     sixteen = _mm_shufflelo_epi16(sixteen, 0);
     141                 : 
     142                 :     // ( 0,  0,  0,  0,  0,  0,  0,  y)
     143               0 :     __m128i allY = _mm_cvtsi32_si128(subY);
     144                 : 
     145                 :     // ( 0,  0,  0,  0,  y,  y,  y,  y)
     146               0 :     allY = _mm_shufflelo_epi16(allY, 0);
     147                 : 
     148                 :     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
     149               0 :     __m128i negY = _mm_sub_epi16(sixteen, allY);
     150                 : 
     151                 :     // (16-y, 16-y, 16-y, 16-y, y, y, y, y): row0 weight in the high half, row1 weight in the low half.
     152               0 :     allY = _mm_unpacklo_epi64(allY, negY);
     153                 : 
     154                 :     // (16, 16, 16, 16, 16, 16, 16, 16)
     155               0 :     sixteen = _mm_shuffle_epi32(sixteen, 0);
     156                 : 
     157                 :     // ( 0,  0,  0,  0,  0,  0,  0,  0)
     158               0 :     __m128i zero = _mm_setzero_si128();
     159                 : 
     160                 :     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
     161               0 :     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
     162                 : 
     163               0 :     do {
     164               0 :         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
     165               0 :         unsigned x0 = XX >> 18;
     166               0 :         unsigned x1 = XX & 0x3FFF;
     167                 : 
     168                 :         // (0, 0, 0, 0, 0, 0, 0, x) where x is the 4-bit sub-pixel x weight.
     169               0 :         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
     170                 :         
     171                 :         // (0, 0, 0, 0, x, x, x, x)
     172               0 :         allX = _mm_shufflelo_epi16(allX, 0);
     173                 : 
     174                 :         // (x, x, x, x, x, x, x, x)
     175               0 :         allX = _mm_shuffle_epi32(allX, 0);
     176                 : 
     177                 :         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
     178               0 :         __m128i negX = _mm_sub_epi16(sixteen, allX);
     179                 : 
     180                 :         // Load 4 samples (pixels), named a<row><column>.
     181               0 :         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
     182               0 :         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
     183               0 :         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
     184               0 :         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
     185                 : 
     186                 :         // (0, 0, a00, a10) as 32-bit lanes.
     187               0 :         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
     188                 : 
     189                 :         // Expand to 16 bits per component.
     190               0 :         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
     191                 : 
     192                 :         // ((a00 * (16-y)), (a10 * y)).
     193               0 :         a00a10 = _mm_mullo_epi16(a00a10, allY);
     194                 : 
     195                 :         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
     196               0 :         a00a10 = _mm_mullo_epi16(a00a10, negX);
     197                 : 
     198                 :         // (0, 0, a01, a11) as 32-bit lanes.
     199               0 :         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
     200                 : 
     201                 :         // Expand to 16 bits per component.
     202               0 :         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
     203                 : 
     204                 :         // (a01 * (16-y)), (a11 * y)
     205               0 :         a01a11 = _mm_mullo_epi16(a01a11, allY);
     206                 : 
     207                 :         // (a01 * (16-y) * x), (a11 * y * x)
     208               0 :         a01a11 = _mm_mullo_epi16(a01a11, allX);
     209                 : 
     210                 :         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
     211               0 :         __m128i sum = _mm_add_epi16(a00a10, a01a11);
     212                 : 
     213                 :         // Copy the high 64 bits into the low 64 bits: (DC, a00*w00 + a01*w01), DC = don't care.
     214               0 :         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
     215                 : 
     216                 :         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
     217               0 :         sum = _mm_add_epi16(sum, shifted);
     218                 : 
     219                 :         // Divide each 16 bit component by 256 (the four weights sum to 16 * 16).
     220               0 :         sum = _mm_srli_epi16(sum, 8);
     221                 : 
     222                 :         // Multiply by alpha (s.fAlphaScale broadcast to every lane).
     223               0 :         sum = _mm_mullo_epi16(sum, alpha);
     224                 : 
     225                 :         // Divide each 16 bit component by 256.
     226               0 :         sum = _mm_srli_epi16(sum, 8);
     227                 : 
     228                 :         // Pack lower 4 16 bit values of sum into lower 4 bytes.
     229               0 :         sum = _mm_packus_epi16(sum, zero);
     230                 : 
     231                 :         // Extract low int and store.
     232               0 :         *colors++ = _mm_cvtsi128_si32(sum);
     233                 :     } while (--count > 0);
     234               0 : }

Generated by: LCOV version 1.7