LCOV - code coverage report
Current view: directory - gfx/qcms - transform-sse1.c (source / functions) Found Hit Coverage
Test: app.info Lines: 126 0 0.0 %
Date: 2012-06-02 Functions: 2 0 0.0 %

       1                 : #include <xmmintrin.h>
       2                 : 
       3                 : #include "qcmsint.h"
       4                 : 
       5                 : /* pre-broadcast constants: each value is repeated in all four lanes, so one aligned load fills an XMM reg instead of a load-scalar/shufps sequence */
       6                 : #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE) /* factor applied to the clamped result before it is converted to an output-table index */
       7                 : #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) /* upper clamp; multiplied by FLOATSCALE it gives PRECACHE_OUTPUT_SIZE - 1, up to float rounding */
       8                 : static const ALIGN float floatScaleX4[4] = /* ALIGN is defined outside this file; presumably 16-byte alignment, which _mm_load_ps requires -- TODO confirm */
       9                 :     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
      10                 : static const ALIGN float clampMaxValueX4[4] =
      11                 :     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
      12                 : 
      13               0 : void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform, /* supplies the matrix, the three input gamma tables and the three output tables */
      14                 :                                           unsigned char *src, /* input pixels, 3 bytes per pixel; only read here */
      15                 :                                           unsigned char *dest, /* output pixels, 3 bytes per pixel */
      16                 :                                           size_t length) /* number of pixels to convert; 0 is a no-op */
      17                 : { /* Per pixel: look up src[0..2] in the input gamma tables, multiply-add with matrix rows 0..2, clamp, scale, then map through the output tables into dest[0..2]. */
      18                 :     unsigned int i; /* NOTE(review): compared against size_t length; where size_t is wider than unsigned int, i can wrap before i<length fails -- consider size_t */
      19               0 :     float (*mat)[4] = transform->matrix; /* rows mat[0..2] are loaded below, four floats each, with aligned loads */
      20                 :     char input_back[32]; /* raw storage from which a 16-byte-aligned, 16-byte window is carved below */
      21                 :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
      22                 :      * stack alignment: &input_back[16] is rounded down to a multiple of 16, which still leaves 16 bytes inside input_back. We can't use __attribute__((aligned(16))) or __declspec(align(32))
      23                 :      * because they don't work on stack variables. gcc 4.4 does do the right thing
      24                 :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
      25               0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
      26                 :     /* share input and output locations to save having to keep the
      27                 :      * locations in separate registers (in this function input is only used to derive output) */
      28               0 :     uint32_t const * output = (uint32_t*)input; /* NOTE(review): declared pointer-to-const, yet written below through a cast to __m64 * */
      29                 : 
      30                 :     /* deref *transform now to avoid it in loop: per-channel input gamma tables, indexed by a source byte */
      31               0 :     const float *igtbl_r = transform->input_gamma_table_r;
      32               0 :     const float *igtbl_g = transform->input_gamma_table_g;
      33               0 :     const float *igtbl_b = transform->input_gamma_table_b;
      34                 : 
      35                 :     /* deref *transform now to avoid it in loop: per-channel output tables, indexed by the computed integer */
      36               0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
      37               0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
      38               0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
      39                 : 
      40                 :     /* input matrix values never change; _mm_load_ps requires each row to be 16-byte aligned */
      41               0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
      42               0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
      43               0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
      44                 : 
      45                 :     /* these values don't change, either: clamp bounds and the index scale factor */
      46               0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
      47               0 :     const __m128 min   = _mm_setzero_ps();
      48               0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
      49                 : 
      50                 :     /* working variables */
      51                 :     __m128 vec_r, vec_g, vec_b, result;
      52                 : 
      53                 :     /* nothing to do for an empty buffer; this also keeps the length-- below from wrapping */
      54               0 :     if (!length)
      55               0 :         return;
      56                 : 
      57                 :     /* one pixel is handled outside of the loop */
      58               0 :     length--;
      59                 : 
      60                 :     /* setup for transforming 1st pixel: one scalar gamma value per channel */
      61               0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      62               0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      63               0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      64               0 :     src += 3;
      65                 : 
      66                 :     /* transform all but final pixel */
      67                 : /* each iteration finishes the pixel whose gamma values are already loaded and preloads those of the next pixel */
      68               0 :     for (i=0; i<length; i++)
      69                 :     {
      70                 :         /* broadcast each scalar gamma value from lane 0 to all four lanes */
      71               0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
      72               0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
      73               0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
      74                 : 
      75                 :         /* gamma * matrix: each channel value times its own matrix row */
      76               0 :         vec_r = _mm_mul_ps(vec_r, mat0);
      77               0 :         vec_g = _mm_mul_ps(vec_g, mat1);
      78               0 :         vec_b = _mm_mul_ps(vec_b, mat2);
      79                 : 
      80                 :         /* sum the three products, clamp to [0, CLAMPMAXVAL], then scale by FLOATSCALE */
      81               0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
      82               0 :         vec_r  = _mm_max_ps(min, vec_r);
      83               0 :         vec_r  = _mm_min_ps(max, vec_r);
      84               0 :         result = _mm_mul_ps(vec_r, scale);
      85                 : 
      86                 :         /* store calc'd output tables indices: lanes 0-1 go to output[0..1], then lanes 2-3 go to output[2..3] (output[3] is never read) */
      87               0 :         *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
      88               0 :         result = _mm_movehl_ps(result, result);
      89               0 :         *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
      90                 : 
      91                 :         /* load for next loop while store completes; src already points at the next pixel */
      92               0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
      93               0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
      94               0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
      95               0 :         src += 3;
      96                 : 
      97                 :         /* use calc'd indices to output RGB values */
      98               0 :         dest[0] = otdata_r[output[0]];
      99               0 :         dest[1] = otdata_g[output[1]];
     100               0 :         dest[2] = otdata_b[output[2]];
     101               0 :         dest += 3;
     102                 :     }
     103                 : 
     104                 :     /* handle final (maybe only) pixel */
     105                 : /* same steps as the loop body, minus the preload: there is no next pixel to read */
     106               0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     107               0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     108               0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     109                 : 
     110               0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     111               0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     112               0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     113                 : 
     114               0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     115               0 :     vec_r  = _mm_max_ps(min, vec_r);
     116               0 :     vec_r  = _mm_min_ps(max, vec_r);
     117               0 :     result = _mm_mul_ps(vec_r, scale);
     118                 : 
     119               0 :     *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     120               0 :     result = _mm_movehl_ps(result, result);
     121               0 :     *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     122                 : 
     123               0 :     dest[0] = otdata_r[output[0]];
     124               0 :     dest[1] = otdata_g[output[1]];
     125               0 :     dest[2] = otdata_b[output[2]];
     126                 : 
     127                 :     _mm_empty(); /* leave MMX state: the __m64 conversions above use MMX registers, which share state with x87 floating point */
     128                 : }
     129                 : 
     130               0 : void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform, /* supplies the matrix, the three input gamma tables and the three output tables */
     131                 :                                            unsigned char *src, /* input pixels, 4 bytes per pixel; only read here */
     132                 :                                            unsigned char *dest, /* output pixels, 4 bytes per pixel */
     133                 :                                            size_t length) /* number of pixels to convert; 0 is a no-op */
     134                 : { /* Per pixel: bytes 0..2 go through gamma lookup, matrix multiply-add, clamp, scale and output-table lookup; byte 3 (alpha) is copied unchanged. */
     135                 :     unsigned int i; /* NOTE(review): compared against size_t length; where size_t is wider than unsigned int, i can wrap before i<length fails -- consider size_t */
     136               0 :     float (*mat)[4] = transform->matrix; /* rows mat[0..2] are loaded below, four floats each, with aligned loads */
     137                 :     char input_back[32]; /* raw storage from which a 16-byte-aligned, 16-byte window is carved below */
     138                 :     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     139                 :      * stack alignment: &input_back[16] is rounded down to a multiple of 16, which still leaves 16 bytes inside input_back. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     140                 :      * because they don't work on stack variables. gcc 4.4 does do the right thing
     141                 :      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
     142               0 :     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
     143                 :     /* share input and output locations to save having to keep the
     144                 :      * locations in separate registers (in this function input is only used to derive output) */
     145               0 :     uint32_t const * output = (uint32_t*)input; /* NOTE(review): declared pointer-to-const, yet written below through a cast to __m64 * */
     146                 : 
     147                 :     /* deref *transform now to avoid it in loop: per-channel input gamma tables, indexed by a source byte */
     148               0 :     const float *igtbl_r = transform->input_gamma_table_r;
     149               0 :     const float *igtbl_g = transform->input_gamma_table_g;
     150               0 :     const float *igtbl_b = transform->input_gamma_table_b;
     151                 : 
     152                 :     /* deref *transform now to avoid it in loop: per-channel output tables, indexed by the computed integer */
     153               0 :     const uint8_t *otdata_r = &transform->output_table_r->data[0];
     154               0 :     const uint8_t *otdata_g = &transform->output_table_g->data[0];
     155               0 :     const uint8_t *otdata_b = &transform->output_table_b->data[0];
     156                 : 
     157                 :     /* input matrix values never change; _mm_load_ps requires each row to be 16-byte aligned */
     158               0 :     const __m128 mat0  = _mm_load_ps(mat[0]);
     159               0 :     const __m128 mat1  = _mm_load_ps(mat[1]);
     160               0 :     const __m128 mat2  = _mm_load_ps(mat[2]);
     161                 : 
     162                 :     /* these values don't change, either: clamp bounds and the index scale factor */
     163               0 :     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     164               0 :     const __m128 min   = _mm_setzero_ps();
     165               0 :     const __m128 scale = _mm_load_ps(floatScaleX4);
     166                 : 
     167                 :     /* working variables */
     168                 :     __m128 vec_r, vec_g, vec_b, result;
     169                 :     unsigned char alpha; /* alpha byte of the pixel currently being finished; copied through untouched */
     170                 : 
     171                 :     /* nothing to do for an empty buffer; this also keeps the length-- below from wrapping */
     172               0 :     if (!length)
     173               0 :         return;
     174                 : 
     175                 :     /* one pixel is handled outside of the loop */
     176               0 :     length--;
     177                 : 
     178                 :     /* setup for transforming 1st pixel: one scalar gamma value per channel, plus its alpha byte */
     179               0 :     vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     180               0 :     vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     181               0 :     vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     182               0 :     alpha = src[3];
     183               0 :     src += 4;
     184                 : 
     185                 :     /* transform all but final pixel */
     186                 : /* each iteration finishes the pixel whose gamma values are already loaded and preloads those of the next pixel */
     187               0 :     for (i=0; i<length; i++)
     188                 :     {
     189                 :         /* broadcast each scalar gamma value from lane 0 to all four lanes */
     190               0 :         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     191               0 :         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     192               0 :         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     193                 : 
     194                 :         /* gamma * matrix: each channel value times its own matrix row */
     195               0 :         vec_r = _mm_mul_ps(vec_r, mat0);
     196               0 :         vec_g = _mm_mul_ps(vec_g, mat1);
     197               0 :         vec_b = _mm_mul_ps(vec_b, mat2);
     198                 : 
     199                 :         /* store alpha for this pixel; load alpha for next (src already points at the next pixel) */
     200               0 :         dest[3] = alpha;
     201               0 :         alpha   = src[3];
     202                 : 
     203                 :         /* sum the three products, clamp to [0, CLAMPMAXVAL], then scale by FLOATSCALE */
     204               0 :         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     205               0 :         vec_r  = _mm_max_ps(min, vec_r);
     206               0 :         vec_r  = _mm_min_ps(max, vec_r);
     207               0 :         result = _mm_mul_ps(vec_r, scale);
     208                 : 
     209                 :         /* store calc'd output tables indices: lanes 0-1 go to output[0..1], then lanes 2-3 go to output[2..3] (output[3] is never read) */
     210               0 :         *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     211               0 :         result = _mm_movehl_ps(result, result);
     212               0 :         *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     213                 : 
     214                 :         /* load gamma values for next loop while store completes */
     215               0 :         vec_r = _mm_load_ss(&igtbl_r[src[0]]);
     216               0 :         vec_g = _mm_load_ss(&igtbl_g[src[1]]);
     217               0 :         vec_b = _mm_load_ss(&igtbl_b[src[2]]);
     218               0 :         src += 4;
     219                 : 
     220                 :         /* use calc'd indices to output RGB values */
     221               0 :         dest[0] = otdata_r[output[0]];
     222               0 :         dest[1] = otdata_g[output[1]];
     223               0 :         dest[2] = otdata_b[output[2]];
     224               0 :         dest += 4;
     225                 :     }
     226                 : 
     227                 :     /* handle final (maybe only) pixel */
     228                 : /* same steps as the loop body, minus the preloads: there is no next pixel to read */
     229               0 :     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     230               0 :     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     231               0 :     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
     232                 : 
     233               0 :     vec_r = _mm_mul_ps(vec_r, mat0);
     234               0 :     vec_g = _mm_mul_ps(vec_g, mat1);
     235               0 :     vec_b = _mm_mul_ps(vec_b, mat2);
     236                 : 
     237               0 :     dest[3] = alpha; /* alpha of the final pixel, loaded before or during the last loop iteration */
     238                 : 
     239               0 :     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     240               0 :     vec_r  = _mm_max_ps(min, vec_r);
     241               0 :     vec_r  = _mm_min_ps(max, vec_r);
     242               0 :     result = _mm_mul_ps(vec_r, scale);
     243                 : 
     244               0 :     *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     245               0 :     result = _mm_movehl_ps(result, result);
     246               0 :     *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
     247                 : 
     248               0 :     dest[0] = otdata_r[output[0]];
     249               0 :     dest[1] = otdata_g[output[1]];
     250               0 :     dest[2] = otdata_b[output[2]];
     251                 : 
     252                 :     _mm_empty(); /* leave MMX state: the __m64 conversions above use MMX registers, which share state with x87 floating point */
     253                 : }

Generated by: LCOV version 1.7