1 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Thebes gfx.
16 : *
17 : * The Initial Developer of the Original Code is Oracle Corporation.
18 : * Portions created by the Initial Developer are Copyright (C) 2010
19 : * the Initial Developer. All Rights Reserved.
20 : *
21 : * Contributor(s):
22 : *
23 : * Alternatively, the contents of this file may be used under the terms of
24 : * either the GNU General Public License Version 2 or later (the "GPL"), or
25 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 : * in which case the provisions of the GPL or the LGPL are applicable instead
27 : * of those above. If you wish to allow use of your version of this file only
28 : * under the terms of either the GPL or the LGPL, and not to allow others to
29 : * use your version of this file under the terms of the MPL, indicate your
30 : * decision by deleting the provisions above and replace them with the notice
31 : * and other provisions required by the GPL or the LGPL. If you do not delete
32 : * the provisions above, a recipient may use your version of this file under
33 : * the terms of any one of the MPL, the GPL or the LGPL.
34 : *
35 : * ***** END LICENSE BLOCK ***** */
36 :
37 : #include "mozilla/SSE.h"
38 : #include "gfxAlphaRecovery.h"
39 : #include <emmintrin.h>
40 :
41 : // This file should only be compiled on x86 and x64 systems. Additionally,
42 : // you'll need to compile it with -msse2 if you're using GCC on x86.
43 :
// 16-byte-aligned mask constants for the aligned SSE2 loads below.
// greenMaski selects the green channel (0x0000ff00) of each 32-bit pixel in
// an XMM register; alphaMaski selects the alpha channel (0xff000000).  Each
// supported compiler spells the alignment attribute differently, hence the
// three-way #if chain (MSVC, GCC, Sun Studio).
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static PRUint32 greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static PRUint32 alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static PRUint32 greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static PRUint32 alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static PRUint32 greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static PRUint32 alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
59 :
/**
 * SSE2 fast path for alpha recovery: the same content was drawn over black
 * (blackSurf) and over white (whiteSurf), and per the arithmetic below the
 * recovered alpha is 255 - (white.green - black.green), written (together
 * with black's RGB) back into blackSurf.
 *
 * Returns false, leaving blackSurf untouched, when this path cannot run:
 * mismatched sizes, unsupported formats, or data pointers / strides that
 * cannot be kept mutually 16-byte aligned.  Callers are expected to fall
 * back to the scalar RecoverAlpha path in that case.
 */
bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    // Only 32-bit-per-pixel formats are handled; anything else bails out.
    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
        (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    // The two pointers must stay congruent mod 16 across every row, so that
    // once blackData reaches 16-byte alignment whiteData is aligned too:
    // that requires equal starting misalignment and strides differing by a
    // multiple of 16.
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.
        // NOTE(review): the bound is conservative -- exactly 8 (resp. 4 below)
        // remaining pixels fall through to the slower loops; the result is
        // still correct, it just misses one wide iteration.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            // After these two steps each green byte holds
            // 255 - (white.green - black.green), saturated at 0.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            // Clear black's alpha bytes, shift the whole register left by 2
            // bytes so each pixel's green byte (offset 1) lands in its alpha
            // slot (offset 3) -- bytes that cross pixel boundaries are wiped
            // by the alphaMask AND -- then OR the recovered alpha into black.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        // Same recovery as above, 4 pixels (one XMM register) at a time.
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // Advance past any stride padding to the start of the next row.
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
163 :
164 : static PRInt32
165 0 : ByteAlignment(PRInt32 aAlignToLog2, PRInt32 aX, PRInt32 aY=0, PRInt32 aStride=1)
166 : {
167 0 : return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
168 : }
169 :
/**
 * Return the smallest found superset of aRect whose top-left pixel and
 * width are aligned for SIMD alpha recovery on a subimage of aSurface.
 * Falls back to returning aRect unchanged when the surface stride has
 * padding (so alignment can't be computed from width alone) or when no
 * aligned superset fits within the surface bounds.
 */
/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxASurface::ImageFormatARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const PRInt32 kByteAlignLog2 = GoodAlignmentLog2();
    static const PRInt32 bpp = 4;
    static const PRInt32 pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD.  So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned.  Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake.  So we also want
    //
    //  minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant.  So,
    // brute force is viable.  The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below).  In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing?  Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound).  This analysis can lead to O(w+h) extra work
    // (with small constants).  In exchange, we expect to shave off a
    // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
    // only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead.  (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference so this code
    // shouldn't be called.)
    //
    gfxIntSize surfaceSize = aSurface->GetSize();
    const PRInt32 stride = bpp * surfaceSize.width;
    // The alignment math above assumes stride == bpp * width; a padded
    // stride invalidates it, so punt to the slow path.
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const PRInt32 x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const PRInt32 r = x + w;        // right edge of the original rect
    const PRInt32 sw = surfaceSize.width;
    const PRInt32 strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect.  These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    //
    // dx/dy/dr are declared before the loops because the goto below
    // jumps past their loop scopes and their final values build the
    // solution rect.
    PRInt32 dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}
|