LCOV - code coverage report
Current view: directory - media/libtheora/lib/x86 - sse2idct.c (source / functions) Found Hit Coverage
Test: app.info Lines: 25 0 0.0 %
Date: 2012-06-02 Functions: 3 0 0.0 %

       1                 : /********************************************************************
       2                 :  *                                                                  *
       3                 :  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
       4                 :  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
       5                 :  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
       6                 :  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
       7                 :  *                                                                  *
       8                 :  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
       9                 :  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
      10                 :  *                                                                  *
      11                 :  ********************************************************************
      12                 : 
      13                 :   function:
      14                 :     last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
      15                 : 
      16                 :  ********************************************************************/
      17                 : 
      18                 : /*SSE2 acceleration of Theora's iDCT.*/
      19                 : #include "x86int.h"
      20                 : #include "sse2trans.h"
      21                 : #include "../dct.h"
      22                 : 
      23                 : #if defined(OC_X86_ASM)
      24                 : 
      25                 : /*A table of constants, each replicated 8 times, used by the MMX and SSE2 routines.*/
      26                 : const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
      27                 :         8,      8,      8,      8,      8,      8,      8,      8,
      28                 :   OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
      29                 :   OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
      30                 :   OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
      31                 :   OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
      32                 :   OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
      33                 :   OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
      34                 :   OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
      35                 : };
      36                 : 
      37                 : 
      38                 : /*Performs the first three stages of the iDCT.
      39                 :   xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
      40                 :    (accessed in that order).
      41                 :   The remaining rows must be in _x at their corresponding locations.
      42                 :   On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
      43                 :    contain rows 4 through 7.*/
      44                 : #define OC_IDCT_8x8_ABC(_x) \
      45                 :   "#OC_IDCT_8x8_ABC\n\t" \
      46                 :   /*Stage 1:*/ \
      47                 :   /*2-3 rotation by 6pi/16. \
      48                 :     xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
      49                 :   "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
      50                 :   "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
      51                 :   "movdqa %%xmm1,%%xmm0\n\t" \
      52                 :   "pmulhw %%xmm2,%%xmm1\n\t" \
      53                 :   "movdqa %%xmm4,%%xmm7\n\t" \
      54                 :   "pmulhw %%xmm6,%%xmm0\n\t" \
      55                 :   "pmulhw %%xmm2,%%xmm7\n\t" \
      56                 :   "pmulhw %%xmm6,%%xmm4\n\t" \
      57                 :   "paddw %%xmm6,%%xmm0\n\t" \
      58                 :   "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
      59                 :   "paddw %%xmm1,%%xmm2\n\t" \
      60                 :   "psubw %%xmm0,%%xmm7\n\t" \
      61                 :   "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
      62                 :   "paddw %%xmm4,%%xmm2\n\t" \
      63                 :   "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
      64                 :   "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
      65                 :   /*5-6 rotation by 3pi/16. \
      66                 :     xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
      67                 :   "movdqa %%xmm4,%%xmm2\n\t" \
      68                 :   "movdqa %%xmm6,%%xmm1\n\t" \
      69                 :   "pmulhw %%xmm3,%%xmm4\n\t" \
      70                 :   "pmulhw %%xmm5,%%xmm1\n\t" \
      71                 :   "pmulhw %%xmm3,%%xmm6\n\t" \
      72                 :   "pmulhw %%xmm5,%%xmm2\n\t" \
      73                 :   "paddw %%xmm3,%%xmm4\n\t" \
      74                 :   "paddw %%xmm5,%%xmm3\n\t" \
      75                 :   "paddw %%xmm6,%%xmm3\n\t" \
      76                 :   "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
      77                 :   "paddw %%xmm5,%%xmm1\n\t" \
      78                 :   "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
      79                 :   "paddw %%xmm3,%%xmm2\n\t" \
      80                 :   "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
      81                 :   "psubw %%xmm4,%%xmm1\n\t" \
      82                 :   "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
      83                 :   /*4-7 rotation by 7pi/16. \
      84                 :     xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
      85                 :   "movdqa %%xmm3,%%xmm0\n\t" \
      86                 :   "movdqa %%xmm4,%%xmm7\n\t" \
      87                 :   "pmulhw %%xmm5,%%xmm3\n\t" \
      88                 :   "pmulhw %%xmm5,%%xmm7\n\t" \
      89                 :   "pmulhw %%xmm6,%%xmm4\n\t" \
      90                 :   "pmulhw %%xmm6,%%xmm0\n\t" \
      91                 :   "paddw %%xmm6,%%xmm4\n\t" \
      92                 :   "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
      93                 :   "paddw %%xmm5,%%xmm7\n\t" \
      94                 :   "psubw %%xmm4,%%xmm3\n\t" \
      95                 :   "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
      96                 :   "paddw %%xmm7,%%xmm0\n\t" \
      97                 :   "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
      98                 :   /*0-1 butterfly. \
      99                 :     xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
     100                 :   "paddw %%xmm7,%%xmm6\n\t" \
     101                 :   "movdqa %%xmm4,%%xmm5\n\t" \
     102                 :   "pmulhw %%xmm6,%%xmm4\n\t" \
     103                 :   "paddw %%xmm7,%%xmm7\n\t" \
     104                 :   "psubw %%xmm6,%%xmm7\n\t" \
     105                 :   "paddw %%xmm6,%%xmm4\n\t" \
     106                 :   /*Stage 2:*/ \
     107                 :   /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
     108                 :     7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
     109                 :   "movdqa %%xmm3,%%xmm6\n\t" \
     110                 :   "paddw %%xmm1,%%xmm3\n\t" \
     111                 :   "psubw %%xmm1,%%xmm6\n\t" \
     112                 :   "movdqa %%xmm5,%%xmm1\n\t" \
     113                 :   "pmulhw %%xmm7,%%xmm5\n\t" \
     114                 :   "paddw %%xmm7,%%xmm5\n\t" \
     115                 :   "movdqa %%xmm0,%%xmm7\n\t" \
     116                 :   "paddw %%xmm2,%%xmm0\n\t" \
     117                 :   "psubw %%xmm2,%%xmm7\n\t" \
     118                 :   "movdqa %%xmm1,%%xmm2\n\t" \
     119                 :   "pmulhw %%xmm6,%%xmm1\n\t" \
     120                 :   "pmulhw %%xmm7,%%xmm2\n\t" \
     121                 :   "paddw %%xmm6,%%xmm1\n\t" \
     122                 :   "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
     123                 :   "paddw %%xmm7,%%xmm2\n\t" \
     124                 :   "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
     125                 :   /*Stage 3: \
     126                 :     6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
     127                 :     0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
     128                 :     1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
     129                 :   "paddw %%xmm2,%%xmm1\n\t" \
     130                 :   "paddw %%xmm5,%%xmm6\n\t" \
     131                 :   "paddw %%xmm4,%%xmm7\n\t" \
     132                 :   "paddw %%xmm2,%%xmm2\n\t" \
     133                 :   "paddw %%xmm4,%%xmm4\n\t" \
     134                 :   "paddw %%xmm5,%%xmm5\n\t" \
     135                 :   "psubw %%xmm1,%%xmm2\n\t" \
     136                 :   "psubw %%xmm7,%%xmm4\n\t" \
     137                 :   "psubw %%xmm6,%%xmm5\n\t" \
     138                 : 
     139                 : /*Performs the last stage of the iDCT.
     140                 :   On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
     141                 :    contain rows 4 through 7.
     142                 :   On output, xmm0 through xmm7 contain the corresponding rows.*/
     143                 : #define OC_IDCT_8x8_D \
     144                 :   "#OC_IDCT_8x8_D\n\t" \
     145                 :   /*Stage 4: \
     146                 :     0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
     147                 :     1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
     148                 :     2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
     149                 :     3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
     150                 :   "psubw %%xmm0,%%xmm7\n\t" \
     151                 :   "psubw %%xmm1,%%xmm6\n\t" \
     152                 :   "psubw %%xmm2,%%xmm5\n\t" \
     153                 :   "psubw %%xmm3,%%xmm4\n\t" \
     154                 :   "paddw %%xmm0,%%xmm0\n\t" \
     155                 :   "paddw %%xmm1,%%xmm1\n\t" \
     156                 :   "paddw %%xmm2,%%xmm2\n\t" \
     157                 :   "paddw %%xmm3,%%xmm3\n\t" \
     158                 :   "paddw %%xmm7,%%xmm0\n\t" \
     159                 :   "paddw %%xmm6,%%xmm1\n\t" \
     160                 :   "paddw %%xmm5,%%xmm2\n\t" \
     161                 :   "paddw %%xmm4,%%xmm3\n\t" \
     162                 : 
     163                 : /*Performs the last stage of the iDCT, then rounds and stores the result.
     164                 :   On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
     165                 :    contain rows 4 through 7.
     166                 :   On output, xmm0 through xmm7 and y hold the rows, each as (row+8)>>4.*/
     167                 : #define OC_IDCT_8x8_D_STORE \
     168                 :   "#OC_IDCT_8x8_D_STORE\n\t" \
     169                 :   /*Stage 4: \
     170                 :     0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
     171                 :     1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
     172                 :     2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
     173                 :     3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
     174                 :   "psubw %%xmm3,%%xmm4\n\t" \
     175                 :   "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
     176                 :   "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
     177                 :   "psubw %%xmm0,%%xmm7\n\t" \
     178                 :   "psubw %%xmm1,%%xmm6\n\t" \
     179                 :   "psubw %%xmm2,%%xmm5\n\t" \
     180                 :   "paddw %%xmm4,%%xmm7\n\t" \
     181                 :   "paddw %%xmm4,%%xmm6\n\t" \
     182                 :   "paddw %%xmm4,%%xmm5\n\t" \
     183                 :   "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
     184                 :   "paddw %%xmm0,%%xmm0\n\t" \
     185                 :   "paddw %%xmm1,%%xmm1\n\t" \
     186                 :   "paddw %%xmm2,%%xmm2\n\t" \
     187                 :   "paddw %%xmm3,%%xmm3\n\t" \
     188                 :   "paddw %%xmm7,%%xmm0\n\t" \
     189                 :   "paddw %%xmm6,%%xmm1\n\t" \
     190                 :   "psraw $4,%%xmm0\n\t" \
     191                 :   "paddw %%xmm5,%%xmm2\n\t" \
     192                 :   "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
     193                 :   "psraw $4,%%xmm1\n\t" \
     194                 :   "paddw %%xmm4,%%xmm3\n\t" \
     195                 :   "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
     196                 :   "psraw $4,%%xmm2\n\t" \
     197                 :   "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
     198                 :   "psraw $4,%%xmm3\n\t" \
     199                 :   "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
     200                 :   "psraw $4,%%xmm4\n\t" \
     201                 :   "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
     202                 :   "psraw $4,%%xmm5\n\t" \
     203                 :   "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
     204                 :   "psraw $4,%%xmm6\n\t" \
     205                 :   "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
     206                 :   "psraw $4,%%xmm7\n\t" \
     207                 :   "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
     208                 : 
     209               0 : static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     210                 :   OC_ALIGN16(ogg_int16_t buf[16]);
     211                 :   /*This routine accepts an 8x8 matrix pre-transposed.*/
     212               0 :   __asm__ __volatile__(
     213                 :     /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
     214                 :     "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
     215                 :     "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
     216                 :     "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
     217                 :     "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
     218                 :     OC_IDCT_8x8_ABC(x)
     219                 :     OC_IDCT_8x8_D
     220                 :     OC_TRANSPOSE_8x8
     221                 :     /*Store rows 0, 1, 4, and 7 to y, where OC_IDCT_8x8_ABC(y) reads them back.*/
     222                 :     "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
     223                 :     "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
     224                 :     "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
     225                 :     "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
     226                 :     OC_IDCT_8x8_ABC(y)
     227                 :     OC_IDCT_8x8_D_STORE
     228               0 :     :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     229               0 :      [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
     230               0 :     :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     231               0 :      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
     232                 :   );
     233               0 :   if(_x!=_y){
     234                 :     int i;
     235               0 :     __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
     236                 :     /*Clear input data for next block (decoder only).*/
     237               0 :     for(i=0;i<2;i++){
     238               0 :       __asm__ __volatile__(
     239                 :         "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     240                 :         "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     241                 :         "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
     242                 :         "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
     243               0 :         :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
     244                 :       );
     245                 :     }
     246                 :   }
     247               0 : }
     248                 : 
     249                 : /*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
     250                 :    need to work with four columns at a time.
     251                 :   Doing this in MMX is faster on processors with a 64-bit data path.*/
     252                 : #define OC_IDCT_8x8_10_MMX \
     253                 :   "#OC_IDCT_8x8_10_MMX\n\t" \
     254                 :   /*Stage 1:*/ \
     255                 :   /*2-3 rotation by 6pi/16. \
     256                 :     mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
     257                 :   "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
     258                 :   "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
     259                 :   "pmulhw %%mm2,%%mm6\n\t" \
     260                 :   "pmulhw %%mm2,%%mm7\n\t" \
     261                 :   "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
     262                 :   "paddw %%mm6,%%mm2\n\t" \
     263                 :   "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
     264                 :   "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
     265                 :   "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
     266                 :   /*5-6 rotation by 3pi/16. \
     267                 :     mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
     268                 :   "pmulhw %%mm3,%%mm5\n\t" \
     269                 :   "pmulhw %%mm3,%%mm2\n\t" \
     270                 :   "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
     271                 :   "paddw %%mm3,%%mm5\n\t" \
     272                 :   "paddw %%mm3,%%mm2\n\t" \
     273                 :   "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
     274                 :   /*4-7 rotation by 7pi/16. \
     275                 :     mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
     276                 :   "pmulhw %%mm1,%%mm3\n\t" \
     277                 :   "pmulhw %%mm1,%%mm7\n\t" \
     278                 :   "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
     279                 :   "movq %%mm3,%%mm6\n\t" \
     280                 :   "paddw %%mm1,%%mm7\n\t" \
     281                 :   /*0-1 butterfly. \
     282                 :     mm4=C4, mm0=X0, X4=0.*/ \
     283                 :   /*Stage 2:*/ \
     284                 :   /*4-5 butterfly: mm3=t[4], mm5=t[5] \
     285                 :     7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
     286                 :   "psubw %%mm5,%%mm3\n\t" \
     287                 :   "paddw %%mm5,%%mm6\n\t" \
     288                 :   "movq %%mm4,%%mm1\n\t" \
     289                 :   "pmulhw %%mm0,%%mm4\n\t" \
     290                 :   "paddw %%mm0,%%mm4\n\t" \
     291                 :   "movq %%mm7,%%mm0\n\t" \
     292                 :   "movq %%mm4,%%mm5\n\t" \
     293                 :   "paddw %%mm2,%%mm0\n\t" \
     294                 :   "psubw %%mm2,%%mm7\n\t" \
     295                 :   "movq %%mm1,%%mm2\n\t" \
     296                 :   "pmulhw %%mm6,%%mm1\n\t" \
     297                 :   "pmulhw %%mm7,%%mm2\n\t" \
     298                 :   "paddw %%mm6,%%mm1\n\t" \
     299                 :   "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
     300                 :   "paddw %%mm7,%%mm2\n\t" \
     301                 :   "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
     302                 :   /*Stage 3: \
     303                 :     6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
     304                 :     0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
     305                 :     1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
     306                 :   "paddw %%mm2,%%mm1\n\t" \
     307                 :   "paddw %%mm5,%%mm6\n\t" \
     308                 :   "paddw %%mm4,%%mm7\n\t" \
     309                 :   "paddw %%mm2,%%mm2\n\t" \
     310                 :   "paddw %%mm4,%%mm4\n\t" \
     311                 :   "paddw %%mm5,%%mm5\n\t" \
     312                 :   "psubw %%mm1,%%mm2\n\t" \
     313                 :   "psubw %%mm7,%%mm4\n\t" \
     314                 :   "psubw %%mm6,%%mm5\n\t" \
     315                 :   /*Stage 4: \
     316                 :     0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
     317                 :     1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
     318                 :     2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
     319                 :     3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
     320                 :   "psubw %%mm0,%%mm7\n\t" \
     321                 :   "psubw %%mm1,%%mm6\n\t" \
     322                 :   "psubw %%mm2,%%mm5\n\t" \
     323                 :   "psubw %%mm3,%%mm4\n\t" \
     324                 :   "paddw %%mm0,%%mm0\n\t" \
     325                 :   "paddw %%mm1,%%mm1\n\t" \
     326                 :   "paddw %%mm2,%%mm2\n\t" \
     327                 :   "paddw %%mm3,%%mm3\n\t" \
     328                 :   "paddw %%mm7,%%mm0\n\t" \
     329                 :   "paddw %%mm6,%%mm1\n\t" \
     330                 :   "paddw %%mm5,%%mm2\n\t" \
     331                 :   "paddw %%mm4,%%mm3\n\t" \
     332                 : 
     333                 : #define OC_IDCT_8x8_10_ABC \
     334                 :   "#OC_IDCT_8x8_10_ABC\n\t" \
     335                 :   /*Stage 1:*/ \
     336                 :   /*2-3 rotation by 6pi/16. \
     337                 :     xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
     338                 :   "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
     339                 :   "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
     340                 :   "pmulhw %%xmm2,%%xmm6\n\t" \
     341                 :   "pmulhw %%xmm2,%%xmm7\n\t" \
     342                 :   "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
     343                 :   "paddw %%xmm6,%%xmm2\n\t" \
     344                 :   "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
     345                 :   "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
     346                 :   "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
     347                 :   /*5-6 rotation by 3pi/16. \
     348                 :     xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
     349                 :   "pmulhw %%xmm3,%%xmm5\n\t" \
     350                 :   "pmulhw %%xmm3,%%xmm2\n\t" \
     351                 :   "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
     352                 :   "paddw %%xmm3,%%xmm5\n\t" \
     353                 :   "paddw %%xmm3,%%xmm2\n\t" \
     354                 :   "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
     355                 :   /*4-7 rotation by 7pi/16. \
     356                 :     xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
     357                 :   "pmulhw %%xmm1,%%xmm3\n\t" \
     358                 :   "pmulhw %%xmm1,%%xmm7\n\t" \
     359                 :   "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
     360                 :   "movdqa %%xmm3,%%xmm6\n\t" \
     361                 :   "paddw %%xmm1,%%xmm7\n\t" \
     362                 :   /*0-1 butterfly. \
     363                 :     xmm4=C4, xmm0=X0, X4=0.*/ \
     364                 :   /*Stage 2:*/ \
     365                 :   /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
     366                 :     7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
     367                 :   "psubw %%xmm5,%%xmm3\n\t" \
     368                 :   "paddw %%xmm5,%%xmm6\n\t" \
     369                 :   "movdqa %%xmm4,%%xmm1\n\t" \
     370                 :   "pmulhw %%xmm0,%%xmm4\n\t" \
     371                 :   "paddw %%xmm0,%%xmm4\n\t" \
     372                 :   "movdqa %%xmm7,%%xmm0\n\t" \
     373                 :   "movdqa %%xmm4,%%xmm5\n\t" \
     374                 :   "paddw %%xmm2,%%xmm0\n\t" \
     375                 :   "psubw %%xmm2,%%xmm7\n\t" \
     376                 :   "movdqa %%xmm1,%%xmm2\n\t" \
     377                 :   "pmulhw %%xmm6,%%xmm1\n\t" \
     378                 :   "pmulhw %%xmm7,%%xmm2\n\t" \
     379                 :   "paddw %%xmm6,%%xmm1\n\t" \
     380                 :   "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
     381                 :   "paddw %%xmm7,%%xmm2\n\t" \
     382                 :   "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
     383                 :   /*Stage 3: \
     384                 :     6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
     385                 :     0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
     386                 :     1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
     387                 :   "paddw %%xmm2,%%xmm1\n\t" \
     388                 :   "paddw %%xmm5,%%xmm6\n\t" \
     389                 :   "paddw %%xmm4,%%xmm7\n\t" \
     390                 :   "paddw %%xmm2,%%xmm2\n\t" \
     391                 :   "paddw %%xmm4,%%xmm4\n\t" \
     392                 :   "paddw %%xmm5,%%xmm5\n\t" \
     393                 :   "psubw %%xmm1,%%xmm2\n\t" \
     394                 :   "psubw %%xmm7,%%xmm4\n\t" \
     395                 :   "psubw %%xmm6,%%xmm5\n\t" \
     396                 : 
     397               0 : static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     398                 :   OC_ALIGN16(ogg_int16_t buf[16]);
     399                 :   /*This routine accepts an 8x8 matrix pre-transposed.*/
     400               0 :   __asm__ __volatile__(
     401                 :     "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
     402                 :     "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
     403                 :     "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
     404                 :     "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
     405                 :     OC_IDCT_8x8_10_MMX
     406                 :     OC_TRANSPOSE_8x4_MMX2SSE
     407                 :     OC_IDCT_8x8_10_ABC
     408                 :     OC_IDCT_8x8_D_STORE
     409               0 :     :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
     410               0 :      [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
     411                 :     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
     412               0 :      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
     413                 :   );
     414               0 :   if(_x!=_y){
     415                 :     /*Clear input data for next block (decoder only).*/
     416               0 :     __asm__ __volatile__(
     417                 :       "pxor %%mm0,%%mm0\n\t"
     418                 :       "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     419                 :       "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     420                 :       "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
     421                 :       "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
     422               0 :       :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
     423                 :     );
     424                 :   }
     425               0 : }
     426                 : 
     427                 : /*Performs an inverse 8x8 Type-II DCT transform.
     428                 :   The input is assumed to be scaled by a factor of 4 relative to the
     429                 :    orthonormal version of the transform.*/
     430               0 : void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
     431                 :   /*_last_zzi is subtly different from an actual count of the number of
     432                 :      coefficients we decoded for this block.
     433                 :     It contains the value of zzi BEFORE the final token in the block was
     434                 :      decoded.
     435                 :     In most cases this is an EOB token (the continuation of an EOB run from a
     436                 :      previous block counts), and so this is the same as the coefficient count.
     437                 :     However, in the case that the last token was NOT an EOB token, but filled
     438                 :      the block up with exactly 64 coefficients, _last_zzi will be less than 64.
     439                 :     Provided the last token was not a pure zero run, the minimum value it can
     440                 :      be is 46, and so that doesn't affect any of the cases in this routine.
     441                 :     However, if the last token WAS a pure zero run of length 63, then _last_zzi
     442                 :      will be 1 while the number of coefficients decoded is 64.
     443                 :     Thus, we will trigger the following special case, where the real
     444                 :      coefficient count would not.
     445                 :     Note also that a zero run of length 64 will give _last_zzi a value of 0,
     446                 :      but we still process the DC coefficient, which might have a non-zero value
     447                 :      due to DC prediction.
     448                 :     Although convoluted, this is arguably the correct behavior: it allows us to
     449                 :      use a smaller transform when the block ends with a long zero run instead
     450                 :      of a normal EOB token.
     451                 :     It could be smarter... multiple separate zero runs at the end of a block
     452                 :      will fool it, but an encoder that generates these really deserves what it
     453                 :      gets.
     454                 :     Needless to say we inherited this approach from VP3.*/
     455                 :   /*Perform the iDCT, using the 10-coefficient version when _last_zzi<=10.*/
     456               0 :   if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
     457               0 :   else oc_idct8x8_slow_sse2(_y,_x);
     458               0 : }
     459                 : 
     460                 : #endif

Generated by: LCOV version 1.7