1 : #include <emmintrin.h>
2 :
3 : #include "qcmsint.h"
4 :
5 : /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
6 : #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
7 : #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
8 : static const ALIGN float floatScaleX4[4] =
9 : { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
10 : static const ALIGN float clampMaxValueX4[4] =
11 : { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
12 :
13 0 : void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
14 : unsigned char *src,
15 : unsigned char *dest,
16 : size_t length)
17 : {
18 : unsigned int i;
19 0 : float (*mat)[4] = transform->matrix;
20 : char input_back[32];
21 : /* Ensure we have a buffer that's 16 byte aligned regardless of the original
22 : * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
23 : * because they don't work on stack variables. gcc 4.4 does do the right thing
24 : * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
25 0 : float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
26 : /* share input and output locations to save having to keep the
27 : * locations in separate registers */
28 0 : uint32_t const * output = (uint32_t*)input;
29 :
30 : /* deref *transform now to avoid it in loop */
31 0 : const float *igtbl_r = transform->input_gamma_table_r;
32 0 : const float *igtbl_g = transform->input_gamma_table_g;
33 0 : const float *igtbl_b = transform->input_gamma_table_b;
34 :
35 : /* deref *transform now to avoid it in loop */
36 0 : const uint8_t *otdata_r = &transform->output_table_r->data[0];
37 0 : const uint8_t *otdata_g = &transform->output_table_g->data[0];
38 0 : const uint8_t *otdata_b = &transform->output_table_b->data[0];
39 :
40 : /* input matrix values never change */
41 0 : const __m128 mat0 = _mm_load_ps(mat[0]);
42 0 : const __m128 mat1 = _mm_load_ps(mat[1]);
43 0 : const __m128 mat2 = _mm_load_ps(mat[2]);
44 :
45 : /* these values don't change, either */
46 0 : const __m128 max = _mm_load_ps(clampMaxValueX4);
47 0 : const __m128 min = _mm_setzero_ps();
48 0 : const __m128 scale = _mm_load_ps(floatScaleX4);
49 :
50 : /* working variables */
51 : __m128 vec_r, vec_g, vec_b, result;
52 :
53 : /* CYA */
54 0 : if (!length)
55 0 : return;
56 :
57 : /* one pixel is handled outside of the loop */
58 0 : length--;
59 :
60 : /* setup for transforming 1st pixel */
61 0 : vec_r = _mm_load_ss(&igtbl_r[src[0]]);
62 0 : vec_g = _mm_load_ss(&igtbl_g[src[1]]);
63 0 : vec_b = _mm_load_ss(&igtbl_b[src[2]]);
64 0 : src += 3;
65 :
66 : /* transform all but final pixel */
67 :
68 0 : for (i=0; i<length; i++)
69 : {
70 : /* position values from gamma tables */
71 0 : vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
72 0 : vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
73 0 : vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
74 :
75 : /* gamma * matrix */
76 0 : vec_r = _mm_mul_ps(vec_r, mat0);
77 0 : vec_g = _mm_mul_ps(vec_g, mat1);
78 0 : vec_b = _mm_mul_ps(vec_b, mat2);
79 :
80 : /* crunch, crunch, crunch */
81 0 : vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
82 0 : vec_r = _mm_max_ps(min, vec_r);
83 0 : vec_r = _mm_min_ps(max, vec_r);
84 0 : result = _mm_mul_ps(vec_r, scale);
85 :
86 : /* store calc'd output tables indices */
87 0 : _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
88 :
89 : /* load for next loop while store completes */
90 0 : vec_r = _mm_load_ss(&igtbl_r[src[0]]);
91 0 : vec_g = _mm_load_ss(&igtbl_g[src[1]]);
92 0 : vec_b = _mm_load_ss(&igtbl_b[src[2]]);
93 0 : src += 3;
94 :
95 : /* use calc'd indices to output RGB values */
96 0 : dest[0] = otdata_r[output[0]];
97 0 : dest[1] = otdata_g[output[1]];
98 0 : dest[2] = otdata_b[output[2]];
99 0 : dest += 3;
100 : }
101 :
102 : /* handle final (maybe only) pixel */
103 :
104 0 : vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
105 0 : vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
106 0 : vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
107 :
108 0 : vec_r = _mm_mul_ps(vec_r, mat0);
109 0 : vec_g = _mm_mul_ps(vec_g, mat1);
110 0 : vec_b = _mm_mul_ps(vec_b, mat2);
111 :
112 0 : vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
113 0 : vec_r = _mm_max_ps(min, vec_r);
114 0 : vec_r = _mm_min_ps(max, vec_r);
115 0 : result = _mm_mul_ps(vec_r, scale);
116 :
117 0 : _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
118 :
119 0 : dest[0] = otdata_r[output[0]];
120 0 : dest[1] = otdata_g[output[1]];
121 0 : dest[2] = otdata_b[output[2]];
122 : }
123 :
124 0 : void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
125 : unsigned char *src,
126 : unsigned char *dest,
127 : size_t length)
128 : {
129 : unsigned int i;
130 0 : float (*mat)[4] = transform->matrix;
131 : char input_back[32];
132 : /* Ensure we have a buffer that's 16 byte aligned regardless of the original
133 : * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
134 : * because they don't work on stack variables. gcc 4.4 does do the right thing
135 : * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
136 0 : float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
137 : /* share input and output locations to save having to keep the
138 : * locations in separate registers */
139 0 : uint32_t const * output = (uint32_t*)input;
140 :
141 : /* deref *transform now to avoid it in loop */
142 0 : const float *igtbl_r = transform->input_gamma_table_r;
143 0 : const float *igtbl_g = transform->input_gamma_table_g;
144 0 : const float *igtbl_b = transform->input_gamma_table_b;
145 :
146 : /* deref *transform now to avoid it in loop */
147 0 : const uint8_t *otdata_r = &transform->output_table_r->data[0];
148 0 : const uint8_t *otdata_g = &transform->output_table_g->data[0];
149 0 : const uint8_t *otdata_b = &transform->output_table_b->data[0];
150 :
151 : /* input matrix values never change */
152 0 : const __m128 mat0 = _mm_load_ps(mat[0]);
153 0 : const __m128 mat1 = _mm_load_ps(mat[1]);
154 0 : const __m128 mat2 = _mm_load_ps(mat[2]);
155 :
156 : /* these values don't change, either */
157 0 : const __m128 max = _mm_load_ps(clampMaxValueX4);
158 0 : const __m128 min = _mm_setzero_ps();
159 0 : const __m128 scale = _mm_load_ps(floatScaleX4);
160 :
161 : /* working variables */
162 : __m128 vec_r, vec_g, vec_b, result;
163 : unsigned char alpha;
164 :
165 : /* CYA */
166 0 : if (!length)
167 0 : return;
168 :
169 : /* one pixel is handled outside of the loop */
170 0 : length--;
171 :
172 : /* setup for transforming 1st pixel */
173 0 : vec_r = _mm_load_ss(&igtbl_r[src[0]]);
174 0 : vec_g = _mm_load_ss(&igtbl_g[src[1]]);
175 0 : vec_b = _mm_load_ss(&igtbl_b[src[2]]);
176 0 : alpha = src[3];
177 0 : src += 4;
178 :
179 : /* transform all but final pixel */
180 :
181 0 : for (i=0; i<length; i++)
182 : {
183 : /* position values from gamma tables */
184 0 : vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
185 0 : vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
186 0 : vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
187 :
188 : /* gamma * matrix */
189 0 : vec_r = _mm_mul_ps(vec_r, mat0);
190 0 : vec_g = _mm_mul_ps(vec_g, mat1);
191 0 : vec_b = _mm_mul_ps(vec_b, mat2);
192 :
193 : /* store alpha for this pixel; load alpha for next */
194 0 : dest[3] = alpha;
195 0 : alpha = src[3];
196 :
197 : /* crunch, crunch, crunch */
198 0 : vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
199 0 : vec_r = _mm_max_ps(min, vec_r);
200 0 : vec_r = _mm_min_ps(max, vec_r);
201 0 : result = _mm_mul_ps(vec_r, scale);
202 :
203 : /* store calc'd output tables indices */
204 0 : _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
205 :
206 : /* load gamma values for next loop while store completes */
207 0 : vec_r = _mm_load_ss(&igtbl_r[src[0]]);
208 0 : vec_g = _mm_load_ss(&igtbl_g[src[1]]);
209 0 : vec_b = _mm_load_ss(&igtbl_b[src[2]]);
210 0 : src += 4;
211 :
212 : /* use calc'd indices to output RGB values */
213 0 : dest[0] = otdata_r[output[0]];
214 0 : dest[1] = otdata_g[output[1]];
215 0 : dest[2] = otdata_b[output[2]];
216 0 : dest += 4;
217 : }
218 :
219 : /* handle final (maybe only) pixel */
220 :
221 0 : vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
222 0 : vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
223 0 : vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
224 :
225 0 : vec_r = _mm_mul_ps(vec_r, mat0);
226 0 : vec_g = _mm_mul_ps(vec_g, mat1);
227 0 : vec_b = _mm_mul_ps(vec_b, mat2);
228 :
229 0 : dest[3] = alpha;
230 :
231 0 : vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
232 0 : vec_r = _mm_max_ps(min, vec_r);
233 0 : vec_r = _mm_min_ps(max, vec_r);
234 0 : result = _mm_mul_ps(vec_r, scale);
235 :
236 0 : _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
237 :
238 0 : dest[0] = otdata_r[output[0]];
239 0 : dest[1] = otdata_g[output[1]];
240 0 : dest[2] = otdata_b[output[2]];
241 : }
242 :
243 :
|