LCOV - code coverage report
Current view: directory - media/libtheora/lib/x86 - mmxidct.c (source / functions) Found Hit Coverage
Test: app.info Lines: 18 0 0.0 %
Date: 2012-06-02 Functions: 3 0 0.0 %

       1                 : /********************************************************************
       2                 :  *                                                                  *
       3                 :  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
       4                 :  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
       5                 :  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
       6                 :  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
       7                 :  *                                                                  *
       8                 :  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
       9                 :  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
      10                 :  *                                                                  *
      11                 :  ********************************************************************
      12                 : 
      13                 :   function:
      14                 :     last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
      15                 : 
      16                 :  ********************************************************************/
      17                 : 
      18                 : /*MMX acceleration of Theora's iDCT.
      19                 :   Originally written by Rudolf Marek, based on code from On2's VP3.*/
      20                 : #include "x86int.h"
      21                 : #include "../dct.h"
      22                 : 
      23                 : #if defined(OC_X86_ASM)
      24                 : 
      25                 : /*These are offsets into the table of constants (presumably OC_IDCT_CONSTS,
                         :    the only constant table the asm in this file references).*/
      26                 : /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
      27                 : #define OC_COSINE_OFFSET (0)
      28                 : /*A row of 8's.*/
      29                 : #define OC_EIGHT_OFFSET  (56)
      30                 : /*NOTE(review): neither macro is used by the code visible in this file.
                         :   The asm addresses the table directly: OC_MEM_OFFS(0x00,c) is the addend
                         :    applied before the final shift right by 4, and OC_MEM_OFFS(0x10..0x70,c)
                         :    are the pmulhw multipliers.
                         :   Confirm these two offsets still describe the table layout.*/
      31                 : 
      32                 : 
      33                 : /*Common first stage shared by OC_ROW_IDCT and OC_COLUMN_IDCT.
                         :   Loads its inputs through OC_I()/OC_J() applied to _x, multiplies by the
                         :    table entries at OC_MEM_OFFS(0x10..0x70,c), and spills two intermediates
                         :    to OC_I(1,_y) and OC_I(2,_y) (the first is reloaded into mm0 at the end).
                         :   All other intermediates are left in mm0-mm7 for the invoking macro.
                         :   OC_I and OC_J must be defined by the code that expands this macro.
                         :   38 cycles*/
      34                 : #define OC_IDCT_BEGIN(_y,_x) \
      35                 :   "#OC_IDCT_BEGIN\n\t" \
      36                 :   "movq "OC_I(3,_x)",%%mm2\n\t" \
      37                 :   "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
      38                 :   "movq %%mm2,%%mm4\n\t" \
      39                 :   "movq "OC_J(5,_x)",%%mm7\n\t" \
      40                 :   "pmulhw %%mm6,%%mm4\n\t" \
      41                 :   "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
      42                 :   "pmulhw %%mm7,%%mm6\n\t" \
      43                 :   "movq %%mm1,%%mm5\n\t" \
      44                 :   "pmulhw %%mm2,%%mm1\n\t" \
      45                 :   "movq "OC_I(1,_x)",%%mm3\n\t" \
      46                 :   "pmulhw %%mm7,%%mm5\n\t" \
      47                 :   "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
      48                 :   "paddw %%mm2,%%mm4\n\t" \
      49                 :   "paddw %%mm7,%%mm6\n\t" \
      50                 :   "paddw %%mm1,%%mm2\n\t" \
      51                 :   "movq "OC_J(7,_x)",%%mm1\n\t" \
      52                 :   "paddw %%mm5,%%mm7\n\t" \
      53                 :   "movq %%mm0,%%mm5\n\t" \
      54                 :   "pmulhw %%mm3,%%mm0\n\t" \
      55                 :   "paddw %%mm7,%%mm4\n\t" \
      56                 :   "pmulhw %%mm1,%%mm5\n\t" \
      57                 :   "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
      58                 :   "psubw %%mm2,%%mm6\n\t" \
      59                 :   "paddw %%mm3,%%mm0\n\t" \
      60                 :   "pmulhw %%mm7,%%mm3\n\t" \
      61                 :   "movq "OC_I(2,_x)",%%mm2\n\t" \
      62                 :   "pmulhw %%mm1,%%mm7\n\t" \
      63                 :   "paddw %%mm1,%%mm5\n\t" \
      64                 :   "movq %%mm2,%%mm1\n\t" \
      65                 :   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
      66                 :   "psubw %%mm5,%%mm3\n\t" \
      67                 :   "movq "OC_J(6,_x)",%%mm5\n\t" \
      68                 :   "paddw %%mm7,%%mm0\n\t" \
      69                 :   "movq %%mm5,%%mm7\n\t" \
      70                 :   "psubw %%mm4,%%mm0\n\t" \
      71                 :   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
      72                 :   "paddw %%mm1,%%mm2\n\t" \
      73                 :   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
      74                 :   "paddw %%mm4,%%mm4\n\t" \
      75                 :   "paddw %%mm0,%%mm4\n\t" \
      76                 :   "psubw %%mm6,%%mm3\n\t" \
      77                 :   "paddw %%mm7,%%mm5\n\t" \
      78                 :   "paddw %%mm6,%%mm6\n\t" \
      79                 :   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
      80                 :   "paddw %%mm3,%%mm6\n\t" \
      81                 :   "movq %%mm4,"OC_I(1,_y)"\n\t" \
      82                 :   "psubw %%mm5,%%mm1\n\t" \
      83                 :   "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
      84                 :   "movq %%mm3,%%mm5\n\t" \
      85                 :   "pmulhw %%mm4,%%mm3\n\t" \
      86                 :   "paddw %%mm2,%%mm7\n\t" \
      87                 :   "movq %%mm6,"OC_I(2,_y)"\n\t" \
      88                 :   "movq %%mm0,%%mm2\n\t" \
      89                 :   "movq "OC_I(0,_x)",%%mm6\n\t" \
      90                 :   "pmulhw %%mm4,%%mm0\n\t" \
      91                 :   "paddw %%mm3,%%mm5\n\t" \
      92                 :   "movq "OC_J(4,_x)",%%mm3\n\t" \
      93                 :   "psubw %%mm1,%%mm5\n\t" \
      94                 :   "paddw %%mm0,%%mm2\n\t" \
      95                 :   "psubw %%mm3,%%mm6\n\t" \
      96                 :   "movq %%mm6,%%mm0\n\t" \
      97                 :   "pmulhw %%mm4,%%mm6\n\t" \
      98                 :   "paddw %%mm3,%%mm3\n\t" \
      99                 :   "paddw %%mm1,%%mm1\n\t" \
     100                 :   "paddw %%mm0,%%mm3\n\t" \
     101                 :   "paddw %%mm5,%%mm1\n\t" \
     102                 :   "pmulhw %%mm3,%%mm4\n\t" \
     103                 :   "paddw %%mm0,%%mm6\n\t" \
     104                 :   "psubw %%mm2,%%mm6\n\t" \
     105                 :   "paddw %%mm2,%%mm2\n\t" \
     106                 :   "movq "OC_I(1,_y)",%%mm0\n\t" \
     107                 :   "paddw %%mm6,%%mm2\n\t" \
     108                 :   "paddw %%mm3,%%mm4\n\t" \
     109                 :   "psubw %%mm1,%%mm2\n\t" \
     110                 :   "#end OC_IDCT_BEGIN\n\t" \
     111                 : 
     112                 : /*Row pass: expands OC_IDCT_BEGIN(_y,_x) and then finishes the butterflies.
                         :   On exit R1 has been stored at OC_I(1,_y) and the other results are left in
                         :    mm0 and mm2-mm7, which is the entry layout documented for OC_TRANSPOSE.
                         :   38+8=46 cycles.*/
     113                 : #define OC_ROW_IDCT(_y,_x) \
     114                 :   "#OC_ROW_IDCT\n" \
     115                 :   OC_IDCT_BEGIN(_y,_x) \
     116                 :   /*r3=D'*/ \
     117                 :   "movq "OC_I(2,_y)",%%mm3\n\t" \
     118                 :   /*r4=E'=E-G*/ \
     119                 :   "psubw %%mm7,%%mm4\n\t" \
     120                 :   /*r1=H'+H'*/ \
     121                 :   "paddw %%mm1,%%mm1\n\t" \
     122                 :   /*r7=G+G*/ \
     123                 :   "paddw %%mm7,%%mm7\n\t" \
     124                 :   /*r1=R1=A''+H'*/ \
     125                 :   "paddw %%mm2,%%mm1\n\t" \
     126                 :   /*r7=G'=E+G*/ \
     127                 :   "paddw %%mm4,%%mm7\n\t" \
     128                 :   /*r4=R4=E'-D'*/ \
     129                 :   "psubw %%mm3,%%mm4\n\t" \
     130                 :   "paddw %%mm3,%%mm3\n\t" \
     131                 :   /*r6=R6=F'-B''*/ \
     132                 :   "psubw %%mm5,%%mm6\n\t" \
     133                 :   "paddw %%mm5,%%mm5\n\t" \
     134                 :   /*r3=R3=E'+D'*/ \
     135                 :   "paddw %%mm4,%%mm3\n\t" \
     136                 :   /*r5=R5=F'+B''*/ \
     137                 :   "paddw %%mm6,%%mm5\n\t" \
     138                 :   /*r7=R7=G'-C'*/ \
     139                 :   "psubw %%mm0,%%mm7\n\t" \
     140                 :   "paddw %%mm0,%%mm0\n\t" \
     141                 :   /*Save R1.*/ \
     142                 :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     143                 :   /*r0=R0=G'+C'*/ \
     144                 :   "paddw %%mm7,%%mm0\n\t" \
     145                 :   "#end OC_ROW_IDCT\n\t" \
     146                 : 
     147                 : /*The following macro does two 4x4 transposes in place.
     148                 :   At entry, we assume:
     149                 :     r0 = a3 a2 a1 a0
     150                 :   I(1) = b3 b2 b1 b0
     151                 :     r2 = c3 c2 c1 c0
     152                 :     r3 = d3 d2 d1 d0
     153                 : 
     154                 :     r4 = e3 e2 e1 e0
     155                 :     r5 = f3 f2 f1 f0
     156                 :     r6 = g3 g2 g1 g0
     157                 :     r7 = h3 h2 h1 h0
     158                 : 
     159                 :   At exit, we have:
     160                 :   I(0) = d0 c0 b0 a0
     161                 :   I(1) = d1 c1 b1 a1
     162                 :   I(2) = d2 c2 b2 a2
     163                 :   I(3) = d3 c3 b3 a3
     164                 : 
     165                 :   J(4) = h0 g0 f0 e0
     166                 :   J(5) = h1 g1 f1 e1
     167                 :   J(6) = h2 g2 f2 e2
     168                 :   J(7) = h3 g3 f3 e3
     169                 : 
     170                 :   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
     171                 :   J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
     172                 : 
     173                 :   Since r1 is free at entry, we calculate the Js first.
                         :   While the Js are computed, r0 is parked in I(0) (it is stored there near the
                         :    top of the macro and reloaded into r4 once J(4) and J(5) are written), so
                         :    I(0) is used as scratch before it receives its final value.*/
     174                 : /*19 cycles.*/
     175                 : #define OC_TRANSPOSE(_y) \
     176                 :   "#OC_TRANSPOSE\n\t" \
     177                 :   "movq %%mm4,%%mm1\n\t" \
     178                 :   "punpcklwd %%mm5,%%mm4\n\t" \
     179                 :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     180                 :   "punpckhwd %%mm5,%%mm1\n\t" \
     181                 :   "movq %%mm6,%%mm0\n\t" \
     182                 :   "punpcklwd %%mm7,%%mm6\n\t" \
     183                 :   "movq %%mm4,%%mm5\n\t" \
     184                 :   "punpckldq %%mm6,%%mm4\n\t" \
     185                 :   "punpckhdq %%mm6,%%mm5\n\t" \
     186                 :   "movq %%mm1,%%mm6\n\t" \
     187                 :   "movq %%mm4,"OC_J(4,_y)"\n\t" \
     188                 :   "punpckhwd %%mm7,%%mm0\n\t" \
     189                 :   "movq %%mm5,"OC_J(5,_y)"\n\t" \
     190                 :   "punpckhdq %%mm0,%%mm6\n\t" \
     191                 :   "movq "OC_I(0,_y)",%%mm4\n\t" \
     192                 :   "punpckldq %%mm0,%%mm1\n\t" \
     193                 :   "movq "OC_I(1,_y)",%%mm5\n\t" \
     194                 :   "movq %%mm4,%%mm0\n\t" \
     195                 :   "movq %%mm6,"OC_J(7,_y)"\n\t" \
     196                 :   "punpcklwd %%mm5,%%mm0\n\t" \
     197                 :   "movq %%mm1,"OC_J(6,_y)"\n\t" \
     198                 :   "punpckhwd %%mm5,%%mm4\n\t" \
     199                 :   "movq %%mm2,%%mm5\n\t" \
     200                 :   "punpcklwd %%mm3,%%mm2\n\t" \
     201                 :   "movq %%mm0,%%mm1\n\t" \
     202                 :   "punpckldq %%mm2,%%mm0\n\t" \
     203                 :   "punpckhdq %%mm2,%%mm1\n\t" \
     204                 :   "movq %%mm4,%%mm2\n\t" \
     205                 :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     206                 :   "punpckhwd %%mm3,%%mm5\n\t" \
     207                 :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     208                 :   "punpckhdq %%mm5,%%mm4\n\t" \
     209                 :   "punpckldq %%mm5,%%mm2\n\t" \
     210                 :   "movq %%mm4,"OC_I(3,_y)"\n\t" \
     211                 :   "movq %%mm2,"OC_I(2,_y)"\n\t" \
     212                 :   "#end OC_TRANSPOSE\n\t" \
     213                 : 
     214                 : /*Column pass: expands OC_IDCT_BEGIN(_y,_y), i.e. it works in place on _y, and
                         :    then finishes the butterflies.
                         :   The table row at OC_MEM_OFFS(0x00,c) is added to every result (directly, or
                         :    through the sum it is derived from) before the result is shifted right
                         :    arithmetically by 4 bits and stored to OC_I(0..3,_y)/OC_J(4..7,_y).
                         :   38+19=57 cycles.*/
     215                 : #define OC_COLUMN_IDCT(_y) \
     216                 :   "#OC_COLUMN_IDCT\n" \
     217                 :   OC_IDCT_BEGIN(_y,_y) \
     218                 :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
     219                 :   /*r1=H'+H'*/ \
     220                 :   "paddw %%mm1,%%mm1\n\t" \
     221                 :   /*r1=R1=A''+H'*/ \
     222                 :   "paddw %%mm2,%%mm1\n\t" \
     223                 :   /*r2=NR2*/ \
     224                 :   "psraw $4,%%mm2\n\t" \
     225                 :   /*r4=E'=E-G*/ \
     226                 :   "psubw %%mm7,%%mm4\n\t" \
     227                 :   /*r1=NR1*/ \
     228                 :   "psraw $4,%%mm1\n\t" \
     229                 :   /*r3=D'*/ \
     230                 :   "movq "OC_I(2,_y)",%%mm3\n\t" \
     231                 :   /*r7=G+G*/ \
     232                 :   "paddw %%mm7,%%mm7\n\t" \
     233                 :   /*Store NR2 at I(2).*/ \
     234                 :   "movq %%mm2,"OC_I(2,_y)"\n\t" \
     235                 :   /*r7=G'=E+G*/ \
     236                 :   "paddw %%mm4,%%mm7\n\t" \
     237                 :   /*Store NR1 at I(1).*/ \
     238                 :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     239                 :   /*r4=R4=E'-D'*/ \
     240                 :   "psubw %%mm3,%%mm4\n\t" \
     241                 :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
     242                 :   /*r3=D'+D'*/ \
     243                 :   "paddw %%mm3,%%mm3\n\t" \
     244                 :   /*r3=R3=E'+D'*/ \
     245                 :   "paddw %%mm4,%%mm3\n\t" \
     246                 :   /*r4=NR4*/ \
     247                 :   "psraw $4,%%mm4\n\t" \
     248                 :   /*r6=R6=F'-B''*/ \
     249                 :   "psubw %%mm5,%%mm6\n\t" \
     250                 :   /*r3=NR3*/ \
     251                 :   "psraw $4,%%mm3\n\t" \
     252                 :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
     253                 :   /*r5=B''+B''*/ \
     254                 :   "paddw %%mm5,%%mm5\n\t" \
     255                 :   /*r5=R5=F'+B''*/ \
     256                 :   "paddw %%mm6,%%mm5\n\t" \
     257                 :   /*r6=NR6*/ \
     258                 :   "psraw $4,%%mm6\n\t" \
     259                 :   /*Store NR4 at J(4).*/ \
     260                 :   "movq %%mm4,"OC_J(4,_y)"\n\t" \
     261                 :   /*r5=NR5*/ \
     262                 :   "psraw $4,%%mm5\n\t" \
     263                 :   /*Store NR3 at I(3).*/ \
     264                 :   "movq %%mm3,"OC_I(3,_y)"\n\t" \
     265                 :   /*r7=R7=G'-C'*/ \
     266                 :   "psubw %%mm0,%%mm7\n\t" \
     267                 :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
     268                 :   /*r0=C'+C'*/ \
     269                 :   "paddw %%mm0,%%mm0\n\t" \
     270                 :   /*r0=R0=G'+C'*/ \
     271                 :   "paddw %%mm7,%%mm0\n\t" \
     272                 :   /*r7=NR7*/ \
     273                 :   "psraw $4,%%mm7\n\t" \
     274                 :   /*Store NR6 at J(6).*/ \
     275                 :   "movq %%mm6,"OC_J(6,_y)"\n\t" \
     276                 :   /*r0=NR0*/ \
     277                 :   "psraw $4,%%mm0\n\t" \
     278                 :   /*Store NR5 at J(5).*/ \
     279                 :   "movq %%mm5,"OC_J(5,_y)"\n\t" \
     280                 :   /*Store NR7 at J(7).*/ \
     281                 :   "movq %%mm7,"OC_J(7,_y)"\n\t" \
     282                 :   /*Store NR0 at I(0).*/ \
     283                 :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     284                 :   "#end OC_COLUMN_IDCT\n\t" \
     285                 : 
     286               0 : static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     287                 :   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     288                 :     Every 4x4 block is transposed.
                         :     The asm statement runs OC_ROW_IDCT followed by OC_TRANSPOSE twice, reading
                         :      _x and writing _y, with the OC_I base at byte offsets 0 and 64.
                         :     It then runs OC_COLUMN_IDCT twice in place on _y, with the OC_I base at
                         :      byte offsets 0 and 8 (OC_J is an alias of OC_I for the column passes).
                         :     OC_I and OC_J are redefined before each pass because the macros above
                         :      address memory only through them.*/
     289               0 :   __asm__ __volatile__(
     290                 : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
     291                 : #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
     292               0 :     OC_ROW_IDCT(y,x)
     293                 :     OC_TRANSPOSE(y)
     294                 : #undef  OC_I
     295                 : #undef  OC_J
     296                 : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
     297                 : #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
     298                 :     OC_ROW_IDCT(y,x)
     299                 :     OC_TRANSPOSE(y)
     300                 : #undef  OC_I
     301                 : #undef  OC_J
     302                 : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
     303                 : #define OC_J(_k,_y)   OC_I(_k,_y)
     304                 :     OC_COLUMN_IDCT(y)
     305                 : #undef  OC_I
     306                 : #undef  OC_J
     307                 : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
     308                 : #define OC_J(_k,_y)   OC_I(_k,_y)
     309                 :     OC_COLUMN_IDCT(y)
     310                 : #undef  OC_I
     311                 : #undef  OC_J
     312                 :     :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
     313                 :     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
     314                 :      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
     315                 :   );
     316               0 :   if(_x!=_y){ /*Out of place: zero all 64 input values, 16 per loop iteration.*/
     317                 :     int i;
     318               0 :     __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); /*mm0=0 for the stores below.
                         :       NOTE(review): the loop's asm statements read mm0 without setting it, and
                         :        no operand or clobber tells the compiler that mm0 must survive between
                         :        this statement and those; confirm this is acceptable for the supported
                         :        compilers.*/
     319               0 :     for(i=0;i<4;i++){
     320               0 :       __asm__ __volatile__(
     321                 :         "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     322                 :         "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
     323                 :         "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     324                 :         "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
     325                 :         :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
     326                 :       );
     327                 :     }
     328                 :   }
     329               0 : }
     330                 : 
     331                 : /*Reduced form of OC_IDCT_BEGIN: it reads only OC_I(0..3,_x) and never touches
                         :    OC_J(4..7,_x).
                         :   Like OC_IDCT_BEGIN, it spills intermediates to OC_I(1,_y) and OC_I(2,_y) and
                         :    reloads OC_I(1,_y) into mm0 before it ends.
                         :   25 cycles.*/
     332                 : #define OC_IDCT_BEGIN_10(_y,_x) \
     333                 :  "#OC_IDCT_BEGIN_10\n\t" \
     334                 :  "movq "OC_I(3,_x)",%%mm2\n\t" \
     335                 :  "nop\n\t" \
     336                 :  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
     337                 :  "movq %%mm2,%%mm4\n\t" \
     338                 :  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
     339                 :  "pmulhw %%mm6,%%mm4\n\t" \
     340                 :  "movq "OC_I(1,_x)",%%mm3\n\t" \
     341                 :  "pmulhw %%mm2,%%mm1\n\t" \
     342                 :  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
     343                 :  "paddw %%mm2,%%mm4\n\t" \
     344                 :  "pxor %%mm6,%%mm6\n\t" \
     345                 :  "paddw %%mm1,%%mm2\n\t" \
     346                 :  "movq "OC_I(2,_x)",%%mm5\n\t" \
     347                 :  "pmulhw %%mm3,%%mm0\n\t" \
     348                 :  "movq %%mm5,%%mm1\n\t" \
     349                 :  "paddw %%mm3,%%mm0\n\t" \
     350                 :  "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
     351                 :  "psubw %%mm2,%%mm6\n\t" \
     352                 :  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
     353                 :  "psubw %%mm4,%%mm0\n\t" \
     354                 :  "movq "OC_I(2,_x)",%%mm7\n\t" \
     355                 :  "paddw %%mm4,%%mm4\n\t" \
     356                 :  "paddw %%mm5,%%mm7\n\t" \
     357                 :  "paddw %%mm0,%%mm4\n\t" \
     358                 :  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
     359                 :  "psubw %%mm6,%%mm3\n\t" \
     360                 :  "movq %%mm4,"OC_I(1,_y)"\n\t" \
     361                 :  "paddw %%mm6,%%mm6\n\t" \
     362                 :  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
     363                 :  "paddw %%mm3,%%mm6\n\t" \
     364                 :  "movq %%mm3,%%mm5\n\t" \
     365                 :  "pmulhw %%mm4,%%mm3\n\t" \
     366                 :  "movq %%mm6,"OC_I(2,_y)"\n\t" \
     367                 :  "movq %%mm0,%%mm2\n\t" \
     368                 :  "movq "OC_I(0,_x)",%%mm6\n\t" \
     369                 :  "pmulhw %%mm4,%%mm0\n\t" \
     370                 :  "paddw %%mm3,%%mm5\n\t" \
     371                 :  "paddw %%mm0,%%mm2\n\t" \
     372                 :  "psubw %%mm1,%%mm5\n\t" \
     373                 :  "pmulhw %%mm4,%%mm6\n\t" \
     374                 :  "paddw "OC_I(0,_x)",%%mm6\n\t" \
     375                 :  "paddw %%mm1,%%mm1\n\t" \
     376                 :  "movq %%mm6,%%mm4\n\t" \
     377                 :  "paddw %%mm5,%%mm1\n\t" \
     378                 :  "psubw %%mm2,%%mm6\n\t" \
     379                 :  "paddw %%mm2,%%mm2\n\t" \
     380                 :  "movq "OC_I(1,_y)",%%mm0\n\t" \
     381                 :  "paddw %%mm6,%%mm2\n\t" \
     382                 :  "psubw %%mm1,%%mm2\n\t" \
     383                 :  "nop\n\t" \
     384                 :  "#end OC_IDCT_BEGIN_10\n\t" \
     385                 : 
     386                 : /*Row pass of the reduced transform: expands OC_IDCT_BEGIN_10(_y,_x) and then
                         :    applies the same finishing instructions as OC_ROW_IDCT, so R1 ends up at
                         :    OC_I(1,_y) and the other results stay in mm0 and mm2-mm7.
                         :   25+8=33 cycles.*/
     387                 : #define OC_ROW_IDCT_10(_y,_x) \
     388                 :  "#OC_ROW_IDCT_10\n\t" \
     389                 :  OC_IDCT_BEGIN_10(_y,_x) \
     390                 :  /*r3=D'*/ \
     391                 :  "movq "OC_I(2,_y)",%%mm3\n\t" \
     392                 :  /*r4=E'=E-G*/ \
     393                 :  "psubw %%mm7,%%mm4\n\t" \
     394                 :  /*r1=H'+H'*/ \
     395                 :  "paddw %%mm1,%%mm1\n\t" \
     396                 :  /*r7=G+G*/ \
     397                 :  "paddw %%mm7,%%mm7\n\t" \
     398                 :  /*r1=R1=A''+H'*/ \
     399                 :  "paddw %%mm2,%%mm1\n\t" \
     400                 :  /*r7=G'=E+G*/ \
     401                 :  "paddw %%mm4,%%mm7\n\t" \
     402                 :  /*r4=R4=E'-D'*/ \
     403                 :  "psubw %%mm3,%%mm4\n\t" \
     404                 :  "paddw %%mm3,%%mm3\n\t" \
     405                 :  /*r6=R6=F'-B''*/ \
     406                 :  "psubw %%mm5,%%mm6\n\t" \
     407                 :  "paddw %%mm5,%%mm5\n\t" \
     408                 :  /*r3=R3=E'+D'*/ \
     409                 :  "paddw %%mm4,%%mm3\n\t" \
     410                 :  /*r5=R5=F'+B''*/ \
     411                 :  "paddw %%mm6,%%mm5\n\t" \
     412                 :  /*r7=R7=G'-C'*/ \
     413                 :  "psubw %%mm0,%%mm7\n\t" \
     414                 :  "paddw %%mm0,%%mm0\n\t" \
     415                 :  /*Save R1.*/ \
     416                 :  "movq %%mm1,"OC_I(1,_y)"\n\t" \
     417                 :  /*r0=R0=G'+C'*/ \
     418                 :  "paddw %%mm7,%%mm0\n\t" \
     419                 :  "#end OC_ROW_IDCT_10\n\t" \
     420                 : 
     421                 : /*Column pass of the reduced transform: expands OC_IDCT_BEGIN_10(_y,_y), i.e.
                         :    it works in place on _y, and then applies the same finishing instructions
                         :    as OC_COLUMN_IDCT (add the table row at OC_MEM_OFFS(0x00,c), shift right
                         :    by 4, store to OC_I(0..3,_y)/OC_J(4..7,_y)).
                         :   25+19=44 cycles.*/
     422                 : #define OC_COLUMN_IDCT_10(_y) \
     423                 :  "#OC_COLUMN_IDCT_10\n\t" \
     424                 :  OC_IDCT_BEGIN_10(_y,_y) \
     425                 :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
     426                 :  /*r1=H'+H'*/ \
     427                 :  "paddw %%mm1,%%mm1\n\t" \
     428                 :  /*r1=R1=A''+H'*/ \
     429                 :  "paddw %%mm2,%%mm1\n\t" \
     430                 :  /*r2=NR2*/ \
     431                 :  "psraw $4,%%mm2\n\t" \
     432                 :  /*r4=E'=E-G*/ \
     433                 :  "psubw %%mm7,%%mm4\n\t" \
     434                 :  /*r1=NR1*/ \
     435                 :  "psraw $4,%%mm1\n\t" \
     436                 :  /*r3=D'*/ \
     437                 :  "movq "OC_I(2,_y)",%%mm3\n\t" \
     438                 :  /*r7=G+G*/ \
     439                 :  "paddw %%mm7,%%mm7\n\t" \
     440                 :  /*Store NR2 at I(2).*/ \
     441                 :  "movq %%mm2,"OC_I(2,_y)"\n\t" \
     442                 :  /*r7=G'=E+G*/ \
     443                 :  "paddw %%mm4,%%mm7\n\t" \
     444                 :  /*Store NR1 at I(1).*/ \
     445                 :  "movq %%mm1,"OC_I(1,_y)"\n\t" \
     446                 :  /*r4=R4=E'-D'*/ \
     447                 :  "psubw %%mm3,%%mm4\n\t" \
     448                 :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
     449                 :  /*r3=D'+D'*/ \
     450                 :  "paddw %%mm3,%%mm3\n\t" \
     451                 :  /*r3=R3=E'+D'*/ \
     452                 :  "paddw %%mm4,%%mm3\n\t" \
     453                 :  /*r4=NR4*/ \
     454                 :  "psraw $4,%%mm4\n\t" \
     455                 :  /*r6=R6=F'-B''*/ \
     456                 :  "psubw %%mm5,%%mm6\n\t" \
     457                 :  /*r3=NR3*/ \
     458                 :  "psraw $4,%%mm3\n\t" \
     459                 :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
     460                 :  /*r5=B''+B''*/ \
     461                 :  "paddw %%mm5,%%mm5\n\t" \
     462                 :  /*r5=R5=F'+B''*/ \
     463                 :  "paddw %%mm6,%%mm5\n\t" \
     464                 :  /*r6=NR6*/ \
     465                 :  "psraw $4,%%mm6\n\t" \
     466                 :  /*Store NR4 at J(4).*/ \
     467                 :  "movq %%mm4,"OC_J(4,_y)"\n\t" \
     468                 :  /*r5=NR5*/ \
     469                 :  "psraw $4,%%mm5\n\t" \
     470                 :  /*Store NR3 at I(3).*/ \
     471                 :  "movq %%mm3,"OC_I(3,_y)"\n\t" \
     472                 :  /*r7=R7=G'-C'*/ \
     473                 :  "psubw %%mm0,%%mm7\n\t" \
     474                 :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
     475                 :  /*r0=C'+C'*/ \
     476                 :  "paddw %%mm0,%%mm0\n\t" \
     477                 :  /*r0=R0=G'+C'*/ \
     478                 :  "paddw %%mm7,%%mm0\n\t" \
     479                 :  /*r7=NR7*/ \
     480                 :  "psraw $4,%%mm7\n\t" \
     481                 :  /*Store NR6 at J(6).*/ \
     482                 :  "movq %%mm6,"OC_J(6,_y)"\n\t" \
     483                 :  /*r0=NR0*/ \
     484                 :  "psraw $4,%%mm0\n\t" \
     485                 :  /*Store NR5 at J(5).*/ \
     486                 :  "movq %%mm5,"OC_J(5,_y)"\n\t" \
     487                 :  /*Store NR7 at J(7).*/ \
     488                 :  "movq %%mm7,"OC_J(7,_y)"\n\t" \
     489                 :  /*Store NR0 at I(0).*/ \
     490                 :  "movq %%mm0,"OC_I(0,_y)"\n\t" \
     491                 :  "#end OC_COLUMN_IDCT_10\n\t" \
     492                 : 
     493               0 : static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     494               0 :   __asm__ __volatile__(
     495                 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
     496                 : #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
     497                 :     /*Done with dequant, descramble, and partial transpose.
     498                 :       Now do the iDCT itself.
                         :       A single OC_ROW_IDCT_10 plus OC_TRANSPOSE reads _x and writes _y; the two
                         :        OC_COLUMN_IDCT_10 passes then work in place on _y, with the OC_I base
                         :        at byte offsets 0 and 8 (OC_J is an alias of OC_I for those passes).*/
     499               0 :     OC_ROW_IDCT_10(y,x)
     500                 :     OC_TRANSPOSE(y)
     501                 : #undef  OC_I
     502                 : #undef  OC_J
     503                 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
     504                 : #define OC_J(_k,_y) OC_I(_k,_y)
     505                 :     OC_COLUMN_IDCT_10(y)
     506                 : #undef  OC_I
     507                 : #undef  OC_J
     508                 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
     509                 : #define OC_J(_k,_y) OC_I(_k,_y)
     510                 :     OC_COLUMN_IDCT_10(y)
     511                 : #undef  OC_I
     512                 : #undef  OC_J
     513                 :     :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
     514                 :     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
     515                 :      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
     516                 :   );
     517               0 :   if(_x!=_y){ /*Out of place: zero the part of _x that the row pass read.*/
     518               0 :     __asm__ __volatile__(
     519                 :       "pxor %%mm0,%%mm0\n\t"
     520                 :       "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     521                 :       "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     522                 :       "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
     523                 :       "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
     524                 :       /*Each movq clears four values, at byte offsets 0x00, 0x10, 0x20 and
                         :          0x30; the operand is declared as 28 values, which spans exactly up
                         :          to the end of the last store (byte 0x38).*/
     525                 :       :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
     526                 :     );
     527                 :   }
     528               0 : }
     528                 : 
     529                 : /*Performs an inverse 8x8 Type-II DCT transform.
     530                 :   The input is assumed to be scaled by a factor of 4 relative to the orthonormal
     531                 :    version of the transform.
                         :   _y:        Receives the 64 output values.
                         :   _x:        The 64 input coefficients.
                         :              When _x!=_y, the helper that runs zeroes the input afterwards
                         :               (all 64 values in the slow path, the leading four values of
                         :               the first four rows in the 10-coefficient path).
                         :   _last_zzi: Selects the implementation: values <=10 use
                         :               oc_idct8x8_10_mmx(), anything larger uses
                         :               oc_idct8x8_slow_mmx().
                         :              See the comment in the body for what this value means.*/
     532               0 : void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
     533                 :   /*_last_zzi is subtly different from an actual count of the number of
     534                 :      coefficients we decoded for this block.
     535                 :     It contains the value of zzi BEFORE the final token in the block was
     536                 :      decoded.
     537                 :     In most cases this is an EOB token (the continuation of an EOB run from a
     538                 :      previous block counts), and so this is the same as the coefficient count.
     539                 :     However, in the case that the last token was NOT an EOB token, but filled
     540                 :      the block up with exactly 64 coefficients, _last_zzi will be less than 64.
     541                 :     Provided the last token was not a pure zero run, the minimum value it can
     542                 :      be is 46, and so that doesn't affect any of the cases in this routine.
     543                 :     However, if the last token WAS a pure zero run of length 63, then _last_zzi
     544                 :      will be 1 while the number of coefficients decoded is 64.
     545                 :     Thus, we will trigger the following special case, where the real
     546                 :      coefficient count would not.
     547                 :     Note also that a zero run of length 64 will give _last_zzi a value of 0,
     548                 :      but we still process the DC coefficient, which might have a non-zero value
     549                 :      due to DC prediction.
     550                 :     Although convoluted, this is arguably the correct behavior: it allows us to
     551                 :      use a smaller transform when the block ends with a long zero run instead
     552                 :      of a normal EOB token.
     553                 :     It could be smarter... multiple separate zero runs at the end of a block
     554                 :      will fool it, but an encoder that generates these really deserves what it
     555                 :      gets.
     556                 :     Needless to say we inherited this approach from VP3.*/
     557                 :   /*Then perform the iDCT.*/
     558               0 :   if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
     559               0 :   else oc_idct8x8_slow_mmx(_y,_x);
     560               0 : }
     561                 : 
     562                 : #endif

Generated by: LCOV version 1.7