LCOV - code coverage report
Current view: directory - gfx/thebes - gfxAlphaRecoverySSE2.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 94 0 0.0 %
Date: 2012-06-02 Functions: 3 0 0.0 %

       1                 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
       2                 :  * ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Thebes gfx.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is Oracle Corporation.
      18                 :  * Portions created by the Initial Developer are Copyright (C) 2010
      19                 :  * the Initial Developer. All Rights Reserved.
      20                 :  *
      21                 :  * Contributor(s):
      22                 :  *
      23                 :  * Alternatively, the contents of this file may be used under the terms of
      24                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      25                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      26                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      27                 :  * of those above. If you wish to allow use of your version of this file only
      28                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      29                 :  * use your version of this file under the terms of the MPL, indicate your
      30                 :  * decision by deleting the provisions above and replace them with the notice
      31                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      32                 :  * the provisions above, a recipient may use your version of this file under
      33                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      34                 :  *
      35                 :  * ***** END LICENSE BLOCK ***** */
      36                 : 
      37                 : #include "mozilla/SSE.h"
      38                 : #include "gfxAlphaRecovery.h"
      39                 : #include <emmintrin.h>
      40                 : 
      41                 : // This file should only be compiled on x86 and x64 systems.  Additionally,
      42                 : // you'll need to compile it with -msse2 if you're using GCC on x86.
      43                 : 
      44                 : #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
      45                 : __declspec(align(16)) static PRUint32 greenMaski[] =
      46                 :     { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
      47                 : __declspec(align(16)) static PRUint32 alphaMaski[] =
      48                 :     { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
      49                 : #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
      50                 : static PRUint32 greenMaski[] __attribute__ ((aligned (16))) =
      51                 :     { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
      52                 : static PRUint32 alphaMaski[] __attribute__ ((aligned (16))) =
      53                 :     { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
      54                 : #elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
      55                 : #pragma align 16 (greenMaski, alphaMaski)
      56                 : static PRUint32 greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
      57                 : static PRUint32 alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
      58                 : #endif
      59                 : 
/* Recover the alpha channel of |blackSurf| in place, given the same scene
 * rendered over black (|blackSurf|) and over white (|whiteSurf|), using
 * SSE2 packed operations on 4 pixels per XMM register.
 *
 * Returns false (leaving |blackSurf| untouched) when the surfaces cannot
 * be processed together: mismatched sizes, a format other than ARGB32 /
 * RGB24, or data pointers/strides that cannot be kept mutually 16-byte
 * aligned.  Callers are expected to fall back to the scalar path then.
 */
bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    // Both surfaces must be the same size and a 4-byte-per-pixel format.
    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
        (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    // The two pointers must share the same offset within a 16-byte block,
    // and their strides must differ only by a multiple of 16; otherwise the
    // per-row alignment pre-loop below could align one stream but not the
    // other, and the aligned loads/stores would fault.
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 16-byte alignment (mask is 0xf; the
        // check above guarantees whiteData reaches alignment at the same
        // time as blackData).
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract:
            //   alpha = 0xff - (white.green - black.green)
            // greenMask has 0xff in each green byte and 0 elsewhere, so the
            // second subtract yields the alpha in the green byte and zeroes
            // out every other byte (saturated at 0).
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            // A whole-register shift left by 2 bytes moves each pixel's green
            // byte (offset 1) into its own alpha byte (offset 3), since
            // pixels are 4 bytes wide.
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        // Same recovery, 4 pixels (one XMM register) at a time, for what the
        // 8-pixel loop left over.
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // Skip any padding between the end of the row's pixels and the start
        // of the next row (stride may exceed width * 4).
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
     163                 : 
     164                 : static PRInt32
     165               0 : ByteAlignment(PRInt32 aAlignToLog2, PRInt32 aX, PRInt32 aY=0, PRInt32 aStride=1)
     166                 : {
     167               0 :     return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
     168                 : }
     169                 : 
/* Expand |aRect| to the smallest-found superset rect inside |aSurface|
 * whose top-left pixel and width are aligned for the SIMD alpha-recovery
 * fast path.  Returns |aRect| unchanged when the surface stride is not
 * the packed bpp*width (slow path) or when no aligned superset fits.
 */
/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxASurface::ImageFormatARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const PRInt32 kByteAlignLog2 = GoodAlignmentLog2();
    static const PRInt32 bpp = 4;  // bytes per ARGB32 pixel
    static const PRInt32 pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD.  So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned.  Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake.  So we also want
    //
    //  minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant.  So,
    // brute force is viable.  The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below).  In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing?  Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound).  This analysis can lead to O(w+h) extra work
    // (with small constants).  In exchange, we expect to shave off a
    // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
    // only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead.  (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference so this code
    // shouldn't be called.)
    //
    gfxIntSize surfaceSize = aSurface->GetSize();
    const PRInt32 stride = bpp * surfaceSize.width;
    // The alignment math above assumes a packed stride (no row padding);
    // bail out to the slow path otherwise.
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const PRInt32 x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const PRInt32 r = x + w;   // rightmost edge of the requested rect
    const PRInt32 sw = surfaceSize.width;
    const PRInt32 strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect.  These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    PRInt32 dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    // goto is the cleanest way out of a triple loop here;
                    // <dx,dy,dr> hold the winning offsets.
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}

Generated by: LCOV version 1.7