LCOV - code coverage report
Current view: directory - gfx/qcms - transform-sse2.c (source / functions) Found Hit Coverage
Test: app.info Lines: 118 0 0.0 %
Date: 2012-06-02 Functions: 2 0 0.0 %

       1                 : #include <emmintrin.h>
       2                 : 
       3                 : #include "qcmsint.h"
       4                 : 
       5                 : /* Constants already replicated across all four lanes: load these straight into an XMM reg instead of a load-scalar/shufps sequence */
       6                 : #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
       7                 : #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
       8                 : static const ALIGN float floatScaleX4[4] =
       9                 :     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
      10                 : static const ALIGN float clampMaxValueX4[4] =
      11                 :     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
      12                 : /* Convert length packed RGB pixels (3 bytes each) from src to dest: input gamma tables -> matrix -> clamp -> scale -> output tables. */
      13               0 : void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
      14                 :                                           unsigned char *src,
      15                 :                                           unsigned char *dest,
      16                 :                                           size_t length)
      17                 : {
      18                 :     size_t i; /* same type as length, so the counter cannot wrap before reaching it */
      19               0 :     float (*mat)[4] = transform->matrix;
      20                 :     char input_back[32];
      21                 :     /* Carve a 16-byte-aligned scratch slot out of input_back: rounding &input_back[16]
      22                 :      * down to a multiple of 16 always lands inside the array with 16 usable bytes after it.
      23                 :      * __attribute__((aligned(16))) / __declspec(align(16)) are not used because they don't
      24                 :      * work on stack variables; gcc 4.4 does the right thing on x86 but is too new for us. See gcc bug #16660 */
      25               0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
      26                 :     /* share input and output locations to save having to keep the
      27                 :      * locations in separate registers */
      28               0 :     uint32_t const * output = (uint32_t*)input;
      29                 : 
      30                 :     /* hoist the per-channel input gamma tables out of the transform before the loop */
      31               0 :     const float *igtbl_r = transform->input_gamma_table_r;
      32               0 :     const float *igtbl_g = transform->input_gamma_table_g;
      33               0 :     const float *igtbl_b = transform->input_gamma_table_b;
      34                 : 
      35                 :     /* likewise hoist the per-channel output lookup tables */
      36               0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
      37               0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
      38               0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
      39                 : 
      40                 :     /* mat[0..2] are loop-invariant; load each one once */
      41               0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
      42               0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
      43               0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
      44                 : 
      45                 :     /* clamp bounds and the index scale are loop-invariant as well */
      46               0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
      47               0 :     const __m128 min   = _mm_setzero_ps();
      48               0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
      49                 : 
      50                 :     /* working variables */
      51                 :     __m128 vec_r, vec_g, vec_b, result;
      52                 : 
      53                 :     /* nothing to convert; this also keeps length-- below from wrapping */
      54               0 :     if (!length)
      55               0 :         return;
      56                 : 
      57                 :     /* the loop pre-loads the next pixel, so the last pixel is finished after the loop */
      58               0 :     length--;
      59                 : 
      60                 :     /* gamma-table lookups for the 1st pixel */
      61               0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      62               0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      63               0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      64               0 :     src += 3;
      65                 : 
      66                 :     /* transform all but final pixel */
      67                 : 
      68               0 :     for (i=0; i<length; i++)
      69                 :     {
      70                 :         /* replicate each gamma value into all four lanes */
      71               0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
      72               0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
      73               0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
      74                 : 
      75                 :         /* gamma * matrix */
      76               0 :         vec_r = _mm_mul_ps(vec_r, mat0);
      77               0 :         vec_g = _mm_mul_ps(vec_g, mat1);
      78               0 :         vec_b = _mm_mul_ps(vec_b, mat2);
      79                 : 
      80                 :         /* sum the three products, clamp to [0, CLAMPMAXVAL], scale to table indices */
      81               0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
      82               0 :         vec_r  = _mm_max_ps(min, vec_r);
      83               0 :         vec_r  = _mm_min_ps(max, vec_r);
      84               0 :         result = _mm_mul_ps(vec_r, scale);
      85                 : 
      86                 :         /* convert to 32-bit integers and store the output table indices */
      87               0 :         _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
      88                 : 
      89                 :         /* load for next loop while store completes */
      90               0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      91               0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      92               0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      93               0 :         src += 3;
      94                 : 
      95                 :         /* use calc'd indices to output RGB values */
      96               0 :         dest[0] = otdata_r[output[0]];
      97               0 :         dest[1] = otdata_g[output[1]];
      98               0 :         dest[2] = otdata_b[output[2]];
      99               0 :         dest += 3;
     100                 :     }
     101                 : 
     102                 :     /* handle final (maybe only) pixel: same steps as the loop body, without the pre-load */
     103                 : 
     104               0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     105               0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     106               0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     107                 : 
     108               0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     109               0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     110               0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     111                 : 
     112               0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     113               0 :     vec_r  = _mm_max_ps(min, vec_r);
     114               0 :     vec_r  = _mm_min_ps(max, vec_r);
     115               0 :     result = _mm_mul_ps(vec_r, scale);
     116                 : 
     117               0 :     _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     118                 : 
     119               0 :     dest[0] = otdata_r[output[0]];
     120               0 :     dest[1] = otdata_g[output[1]];
     121               0 :     dest[2] = otdata_b[output[2]];
     122                 : }
     123                 : /* Convert length packed RGBA pixels (4 bytes each) from src to dest: R, G, B go through input gamma tables -> matrix -> clamp -> scale -> output tables; the 4th byte is copied unchanged. */
     124               0 : void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
     125                 :                                            unsigned char *src,
     126                 :                                            unsigned char *dest,
     127                 :                                            size_t length)
     128                 : {
     129                 :     size_t i; /* same type as length, so the counter cannot wrap before reaching it */
     130               0 :     float (*mat)[4] = transform->matrix;
     131                 :     char input_back[32];
     132                 :     /* Carve a 16-byte-aligned scratch slot out of input_back: rounding &input_back[16]
     133                 :      * down to a multiple of 16 always lands inside the array with 16 usable bytes after it.
     134                 :      * __attribute__((aligned(16))) / __declspec(align(16)) are not used because they don't
     135                 :      * work on stack variables; gcc 4.4 does the right thing on x86 but is too new for us. See gcc bug #16660 */
     136               0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
     137                 :     /* share input and output locations to save having to keep the
     138                 :      * locations in separate registers */
     139               0 :     uint32_t const * output = (uint32_t*)input;
     140                 : 
     141                 :     /* hoist the per-channel input gamma tables out of the transform before the loop */
     142               0 :     const float *igtbl_r = transform->input_gamma_table_r;
     143               0 :     const float *igtbl_g = transform->input_gamma_table_g;
     144               0 :     const float *igtbl_b = transform->input_gamma_table_b;
     145                 : 
     146                 :     /* likewise hoist the per-channel output lookup tables */
     147               0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
     148               0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
     149               0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
     150                 : 
     151                 :     /* mat[0..2] are loop-invariant; load each one once */
     152               0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
     153               0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
     154               0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
     155                 : 
     156                 :     /* clamp bounds and the index scale are loop-invariant as well */
     157               0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     158               0 :     const __m128 min   = _mm_setzero_ps();
     159               0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
     160                 : 
     161                 :     /* working variables */
     162                 :     __m128 vec_r, vec_g, vec_b, result;
     163                 :     unsigned char alpha;
     164                 : 
     165                 :     /* nothing to convert; this also keeps length-- below from wrapping */
     166               0 :     if (!length)
     167               0 :         return;
     168                 : 
     169                 :     /* the loop pre-loads the next pixel, so the last pixel is finished after the loop */
     170               0 :     length--;
     171                 : 
     172                 :     /* gamma-table lookups and alpha for the 1st pixel */
     173               0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     174               0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     175               0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     176               0 :     alpha = src[3];
     177               0 :     src += 4;
     178                 : 
     179                 :     /* transform all but final pixel */
     180                 : 
     181               0 :     for (i=0; i<length; i++)
     182                 :     {
     183                 :         /* replicate each gamma value into all four lanes */
     184               0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     185               0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     186               0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     187                 : 
     188                 :         /* gamma * matrix */
     189               0 :         vec_r = _mm_mul_ps(vec_r, mat0);
     190               0 :         vec_g = _mm_mul_ps(vec_g, mat1);
     191               0 :         vec_b = _mm_mul_ps(vec_b, mat2);
     192                 : 
     193                 :         /* store alpha for this pixel; load alpha for next (src already points at the next pixel) */
     194               0 :         dest[3] = alpha;
     195               0 :         alpha   = src[3];
     196                 : 
     197                 :         /* sum the three products, clamp to [0, CLAMPMAXVAL], scale to table indices */
     198               0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     199               0 :         vec_r  = _mm_max_ps(min, vec_r);
     200               0 :         vec_r  = _mm_min_ps(max, vec_r);
     201               0 :         result = _mm_mul_ps(vec_r, scale);
     202                 : 
     203                 :         /* convert to 32-bit integers and store the output table indices */
     204               0 :         _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     205                 : 
     206                 :         /* load gamma values for next loop while store completes */
     207               0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     208               0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     209               0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     210               0 :         src += 4;
     211                 : 
     212                 :         /* use calc'd indices to output RGB values */
     213               0 :         dest[0] = otdata_r[output[0]];
     214               0 :         dest[1] = otdata_g[output[1]];
     215               0 :         dest[2] = otdata_b[output[2]];
     216               0 :         dest += 4;
     217                 :     }
     218                 : 
     219                 :     /* handle final (maybe only) pixel: same steps as the loop body, without the pre-load */
     220                 : 
     221               0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     222               0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     223               0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     224                 : 
     225               0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     226               0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     227               0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     228                 : 
     229               0 :     dest[3] = alpha;
     230                 : 
     231               0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     232               0 :     vec_r  = _mm_max_ps(min, vec_r);
     233               0 :     vec_r  = _mm_min_ps(max, vec_r);
     234               0 :     result = _mm_mul_ps(vec_r, scale);
     235                 : 
     236               0 :     _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
     237                 : 
     238               0 :     dest[0] = otdata_r[output[0]];
     239               0 :     dest[1] = otdata_g[output[1]];
     240               0 :     dest[2] = otdata_b[output[2]];
     241                 : }
     242                 : 
     243                 : 

Generated by: LCOV version 1.7