00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <string.h>
00038 #include <inttypes.h>
00039 #include <math.h>
00040 
00041 #include "config.h"
00042 
00043 #include "mp_msg.h"
00044 #include "cpudetect.h"
00045 #include "img_format.h"
00046 #include "mp_image.h"
00047 #include "vf.h"
00048 #include "vd_ffmpeg.h"
00049 #include "libvo/fastmemcpy.h"
00050 
00051 #include "libavutil/internal.h"
00052 #include "libavutil/intreadwrite.h"
00053 #include "libavutil/mem.h"
00054 #include "libavcodec/avcodec.h"
00055 #include "libavcodec/dsputil.h"
00056 
00057 #undef free
00058 #undef malloc
00059 
00060 
/*
 * BLOCKSZ sizes the two coefficient work buffers in filter() and fixes how
 * many columns (8*(BLOCKSZ-1)) are transformed per horizontal pass there.
 */
#define BLOCKSZ 12

/*
 * Base threshold table, stored as 8 rows of 8 entries.  vf_open() scales
 * every entry by bias/71 and packs the result into
 * vf_priv_s.threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,   /* row 0 */
    245, 193, 185, 121, 102,  73,  53,  27,   /* row 1 */
    158, 129, 141, 107,  97,  73,  50,  26,   /* row 2 */
    102, 116, 109,  98,  82,  66,  45,  23,   /* row 3 */
     71,  94,  95,  81,  70,  56,  38,  20,   /* row 4 */
     56,  77,  74,  66,  56,  44,  30,  15,   /* row 5 */
     38,  53,  50,  45,  38,  30,  21,  11,   /* row 6 */
     20,  27,  26,  23,  20,  15,  11,   5    /* row 7 */
};
00076 
/*
 * 8x8 dither matrix holding each value 0..63 exactly once.  The store_slice
 * routines pick one row per output line and add (entry >> log2_scale) to the
 * accumulated sample before the final down-shift.  The 32-byte alignment is
 * kept from the original declaration.
 */
static const uint8_t __attribute__((aligned(32))) dither[8][8] = {
    {  0, 48, 12, 60,  3, 51, 15, 63 },
    { 32, 16, 44, 28, 35, 19, 47, 31 },
    {  8, 56,  4, 52, 11, 59,  7, 55 },
    { 40, 24, 36, 20, 43, 27, 39, 23 },
    {  2, 50, 14, 62,  1, 49, 13, 61 },
    { 34, 18, 46, 30, 33, 17, 45, 29 },
    { 10, 58,  6, 54,  9, 57,  5, 53 },
    { 42, 26, 38, 22, 41, 25, 37, 21 }
};
00087 
/*
 * Per-instance filter state.
 *
 * The first two members must stay adjacent and in this order:
 * mul_thrmat_mmx() reads from threshold_mtx_noq and writes 8*8*2 bytes past
 * it, which is threshold_mtx only with this layout.
 */
struct vf_priv_s { 
    uint64_t threshold_mtx_noq[8*2]; /* scaled thresholds packed by vf_open(), four 16-bit values per word */
    uint64_t threshold_mtx[8*2];     /* threshold_mtx_noq multiplied by the current quantizer (mul_thrmat_*) */

    int log2_count;   /* quality level; vf_open() keeps it at 4 or 5, filter() uses step = 6 - log2_count */
    int temp_stride;  /* row stride of the work buffers, computed in config() */
    int qp;           /* forced quantizer parsed from the filter args; 0 means use the per-block table */
    int mpeg2;        /* copy of mpi->qscale_type, passed to norm_qscale() */
    int prev_q;       /* quantizer threshold_mtx was last built for */
    uint8_t *src;     /* padded copy of the plane being filtered (allocated in config()) */
    int16_t *temp;    /* 16-bit accumulation buffer (allocated in config()) */
    int bframes;      /* parsed from the filter args; non-zero makes put_image() use the current frame's qscale table */
    char *non_b_qp;   /* copy of the qscale table of the last picture with pict_type != 3 */
};
00102 
00103 
00104 #if !HAVE_MMX
00105 
00106 
00107 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00108 {int y, x;
00109 #define STORE(pos)                                                        \
00110     temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
00111     src[x + pos]=src[x + pos - 8*src_stride]=0;                                \
00112     if(temp & 0x100) temp= ~(temp>>31);                                        \
00113     dst[x + pos]= temp;
00114 
00115     for(y=0; y<height; y++){
00116         const uint8_t *d= dither[y];
00117         for(x=0; x<width; x+=8){
00118             int temp;
00119             STORE(0);
00120             STORE(1);
00121             STORE(2);
00122             STORE(3);
00123             STORE(4);
00124             STORE(5);
00125             STORE(6);
00126             STORE(7);
00127         }
00128         src+=src_stride;
00129         dst+=dst_stride;
00130     }
00131 }
00132 
00133 
00134 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00135 {int y, x;
00136 #define STORE2(pos)                                                        \
00137     temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
00138     src[x + pos + 16*src_stride]=0;                                        \
00139     if(temp & 0x100) temp= ~(temp>>31);                                        \
00140     dst[x + pos]= temp;
00141 
00142     for(y=0; y<height; y++){
00143         const uint8_t *d= dither[y];
00144         for(x=0; x<width; x+=8){
00145             int temp;
00146             STORE2(0);
00147             STORE2(1);
00148             STORE2(2);
00149             STORE2(3);
00150             STORE2(4);
00151             STORE2(5);
00152             STORE2(6);
00153             STORE2(7);
00154         }
00155         src+=src_stride;
00156         dst+=dst_stride;
00157     }
00158 }
00159 
00160 static void mul_thrmat_c(struct vf_priv_s *p,int q)
00161 {
00162     int a;
00163     for(a=0;a<64;a++)
00164         ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];
00165 }
00166 
/* Forward declarations of the C transform kernels used by filter(). */
static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
static void row_idct_c(DCTELEM* workspace,
                       int16_t* output_adr, int output_stride, int cnt);
static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);


/*
 * Without MMX the generic *_s names used by filter() and vf_open() resolve
 * to the plain C implementations.
 */
#define store_slice_s store_slice_c
#define store_slice2_s store_slice2_c
#define mul_thrmat_s mul_thrmat_c
#define column_fidct_s column_fidct_c
#define row_idct_s row_idct_c
#define row_fdct_s row_fdct_c
00179 
00180 #else 
00181 
00182 
/*
 * MMX counterpart of store_slice_c(): dither, down-shift and saturate a slice
 * of 16-bit accumulators into 8-bit pixels, eight at a time.
 *
 * width is rounded up to a multiple of 8 and dst_stride is reduced by that
 * width, so adding it after a row lands on the start of the next row.
 * Inside the asm:
 *   - mm5 holds log2_scale (dither shift), mm2 holds 6-log2_scale (final shift)
 *   - REG_a becomes -16*src_stride, the byte offset of the row 8 lines above
 *   - the src_stride variable (%1) is overwritten with 2*(src_stride-width),
 *     the byte step added to the source pointer after each row
 *   - each inner iteration zeroes the consumed accumulators and the ones
 *     8 rows above, then packs with unsigned saturation (packuswb)
 * The outer loop runs until the dither row pointer reaches `end`
 * (dither[height]).
 *
 * NOTE(review): the asm stores to %1, which is declared as an input operand,
 * and neither "memory" nor the MMX registers appear in the clobber list --
 * confirm this is acceptable for the compilers in use.
 * No emms is issued here; put_image() does that once per frame.
 */
static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
    const uint8_t *od=&dither[0][0];
    const uint8_t *end=&dither[height][0];
    width = (width+7)&~7;
    dst_stride-=width;
    
    __asm__ volatile(
        /* setup: load arguments and derive shift counts / row offsets */
        "mov %5, %%"REG_d"                \n\t"
        "mov %6, %%"REG_S"                \n\t"
        "mov %7, %%"REG_D"                \n\t"
        "mov %1, %%"REG_a"                \n\t"
        "movd %%"REG_d", %%mm5             \n\t"
        "xor $-1, %%"REG_d"              \n\t"
        "mov %%"REG_a", %%"REG_c"             \n\t"
        "add $7, %%"REG_d"               \n\t"
        "neg %%"REG_a"                   \n\t"
        "sub %0, %%"REG_c"            \n\t"
        "add %%"REG_c", %%"REG_c"             \n\t"
        "movd %%"REG_d", %%mm2             \n\t"
        "mov %%"REG_c", %1       \n\t"
        "mov %2, %%"REG_d"               \n\t"
        "shl $4, %%"REG_a"               \n\t"

        /* per row: expand the 8 dither bytes to words and pre-shift them */
        "2:                        \n\t"
        "movq (%%"REG_d"), %%mm3           \n\t"
        "movq %%mm3, %%mm4             \n\t"
        "pxor %%mm7, %%mm7             \n\t"
        "punpcklbw %%mm7, %%mm3        \n\t"
        "punpckhbw %%mm7, %%mm4        \n\t"
        "mov %0, %%"REG_c"            \n\t"
        "psraw %%mm5, %%mm3            \n\t"
        "psraw %%mm5, %%mm4            \n\t"
        /* per 8 pixels: load, add dither, clear accumulators, shift, pack, store */
        "1:                        \n\t"
        "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
        "movq (%%"REG_S"), %%mm0           \n\t"
        "movq 8(%%"REG_S"), %%mm1          \n\t"

        "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
        "paddw %%mm3, %%mm0            \n\t"
        "paddw %%mm4, %%mm1            \n\t"

        "movq %%mm7, (%%"REG_S")           \n\t"
        "psraw %%mm2, %%mm0            \n\t"
        "psraw %%mm2, %%mm1            \n\t"

        "movq %%mm7, 8(%%"REG_S")          \n\t"
        "packuswb %%mm1, %%mm0         \n\t"
        "add $16, %%"REG_S"              \n\t"

        "movq %%mm0, (%%"REG_D")           \n\t"
        "add $8, %%"REG_D"               \n\t"
        "sub $8, %%"REG_c"               \n\t"
        "jg 1b                      \n\t"
        /* advance source, dither row and destination to the next line */
        "add %1, %%"REG_S"       \n\t"
        "add $8, %%"REG_d"               \n\t"
        "add %3, %%"REG_D"       \n\t"
        "cmp %4, %%"REG_d"           \n\t"
        "jl 2b                      \n\t"

        :
        : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
          "m" (log2_scale), "m" (src), "m" (dst) 
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
        );
}
00249 
00250 
/*
 * MMX counterpart of store_slice2_c(): each output pixel is built from the
 * accumulator at the current position plus the one 16 rows below it.
 *
 * width is rounded up to a multiple of 8 and dst_stride is reduced by that
 * width.  Inside the asm:
 *   - mm5 holds log2_scale (dither shift), mm2 holds 6-log2_scale (final shift)
 *   - REG_a becomes 32*src_stride, the byte offset of the row 16 lines below
 *   - the src_stride variable (%1) is overwritten with 2*(src_stride-width),
 *     the byte step added to the source pointer after each row
 *   - only the lower (offset) accumulators are zeroed after use
 *   - results are packed with unsigned saturation (packuswb)
 *
 * NOTE(review): the asm stores to %1, which is declared as an input operand,
 * and neither "memory" nor the MMX registers appear in the clobber list --
 * confirm this is acceptable for the compilers in use.
 * No emms is issued here; put_image() does that once per frame.
 */
static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
{
    const uint8_t *od=&dither[0][0];
    const uint8_t *end=&dither[height][0];
    width = (width+7)&~7;
    dst_stride-=width;
    
    __asm__ volatile(
        /* setup: load arguments and derive shift counts / row offsets */
        "mov %5, %%"REG_d"                \n\t"
        "mov %6, %%"REG_S"                \n\t"
        "mov %7, %%"REG_D"                \n\t"
        "mov %1, %%"REG_a"            \n\t"
        "movd %%"REG_d", %%mm5             \n\t"
        "xor $-1, %%"REG_d"              \n\t"
        "mov %%"REG_a", %%"REG_c"             \n\t"
        "add $7, %%"REG_d"               \n\t"
        "sub %0, %%"REG_c"            \n\t"
        "add %%"REG_c", %%"REG_c"             \n\t"
        "movd %%"REG_d", %%mm2             \n\t"
        "mov %%"REG_c", %1       \n\t"
        "mov %2, %%"REG_d"               \n\t"
        "shl $5, %%"REG_a"               \n\t"

        /* per row: expand the 8 dither bytes to words and pre-shift them */
        "2:                        \n\t"
        "movq (%%"REG_d"), %%mm3           \n\t"
        "movq %%mm3, %%mm4             \n\t"
        "pxor %%mm7, %%mm7             \n\t"
        "punpcklbw %%mm7, %%mm3        \n\t"
        "punpckhbw %%mm7, %%mm4        \n\t"
        "mov %0, %%"REG_c"            \n\t"
        "psraw %%mm5, %%mm3            \n\t"
        "psraw %%mm5, %%mm4            \n\t"
        /* per 8 pixels: sum both rows plus dither, clear the lower row, shift, pack, store */
        "1:                        \n\t"
        "movq (%%"REG_S"), %%mm0           \n\t"
        "movq 8(%%"REG_S"), %%mm1          \n\t"
        "paddw %%mm3, %%mm0            \n\t"

        "paddw (%%"REG_S",%%"REG_a",), %%mm0    \n\t"
        "paddw %%mm4, %%mm1            \n\t"
        "movq 8(%%"REG_S",%%"REG_a",), %%mm6    \n\t"

        "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
        "psraw %%mm2, %%mm0            \n\t"
        "paddw %%mm6, %%mm1            \n\t"

        "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
        "psraw %%mm2, %%mm1            \n\t"
        "packuswb %%mm1, %%mm0         \n\t"

        "movq %%mm0, (%%"REG_D")           \n\t"
        "add $16, %%"REG_S"              \n\t"
        "add $8, %%"REG_D"               \n\t"
        "sub $8, %%"REG_c"               \n\t"
        "jg 1b                      \n\t"
        /* advance source, dither row and destination to the next line */
        "add %1, %%"REG_S"       \n\t"
        "add $8, %%"REG_d"               \n\t"
        "add %3, %%"REG_D"       \n\t"
        "cmp %4, %%"REG_d"           \n\t"
        "jl 2b                      \n\t"

        :
        : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
          "m" (log2_scale), "m" (src), "m" (dst) 
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
        );
}
00317 
/*
 * MMX counterpart of mul_thrmat_c(): multiply all 64 16-bit thresholds in
 * p->threshold_mtx_noq by q and store them in p->threshold_mtx.
 *
 * Both the source (S) and destination (D) registers start at
 * &p->threshold_mtx_noq[0]; the destination is then advanced by 8*8*2 = 128
 * bytes, which is p->threshold_mtx only because that member directly follows
 * the 16-entry uint64_t threshold_mtx_noq array in struct vf_priv_s.
 *
 * q is broadcast to the four words of mm7, and pmullw keeps the low 16 bits
 * of each product.  Sixteen quadwords (offsets 0..15 * 8) are processed, with
 * loads, multiplies and stores interleaved.
 *
 * NOTE(review): no MMX registers or "memory" are listed as clobbers --
 * confirm this is acceptable for the compilers in use.
 */
static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
{
    uint64_t *adr=&p->threshold_mtx_noq[0];
    __asm__ volatile(
        "movd %0, %%mm7                \n\t"
        "add $8*8*2, %%"REG_D"            \n\t"
        "movq 0*8(%%"REG_S"), %%mm0        \n\t"
        "punpcklwd %%mm7, %%mm7        \n\t"
        "movq 1*8(%%"REG_S"), %%mm1        \n\t"
        "punpckldq %%mm7, %%mm7        \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq 2*8(%%"REG_S"), %%mm2        \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq 3*8(%%"REG_S"), %%mm3        \n\t"
        "pmullw %%mm7, %%mm2           \n\t"

        "movq %%mm0, 0*8(%%"REG_D")        \n\t"
        "movq 4*8(%%"REG_S"), %%mm4        \n\t"
        "pmullw %%mm7, %%mm3           \n\t"

        "movq %%mm1, 1*8(%%"REG_D")        \n\t"
        "movq 5*8(%%"REG_S"), %%mm5        \n\t"
        "pmullw %%mm7, %%mm4           \n\t"

        "movq %%mm2, 2*8(%%"REG_D")        \n\t"
        "movq 6*8(%%"REG_S"), %%mm6        \n\t"
        "pmullw %%mm7, %%mm5           \n\t"

        "movq %%mm3, 3*8(%%"REG_D")        \n\t"
        "movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t"
        "pmullw %%mm7, %%mm6           \n\t"

        "movq %%mm4, 4*8(%%"REG_D")        \n\t"
        "movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq %%mm5, 5*8(%%"REG_D")        \n\t"
        "movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq %%mm6, 6*8(%%"REG_D")        \n\t"
        "movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
        "pmullw %%mm7, %%mm2           \n\t"

        "movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
        "movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
        "pmullw %%mm7, %%mm3           \n\t"

        "movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
        "movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
        "pmullw %%mm7, %%mm4           \n\t"

        "movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
        "movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
        "pmullw %%mm7, %%mm5           \n\t"

        "movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
        "movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t"
        "pmullw %%mm7, %%mm6           \n\t"

        "movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
        "movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
        "pmullw %%mm7, %%mm0           \n\t"

        "movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
        "pmullw %%mm7, %%mm1           \n\t"

        "movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
        "movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
        "movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"

        : "+g" (q), "+S" (adr), "+D" (adr)
        :
        );
}
00395 
/* Forward declarations of the MMX transform kernels used by filter(). */
static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt);
static void row_idct_mmx(DCTELEM* workspace,
                         int16_t* output_adr,  int output_stride,  int cnt);
static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt);

/*
 * With MMX the generic *_s names used by filter() and vf_open() resolve to
 * the MMX implementations.
 */
#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
#define mul_thrmat_s mul_thrmat_mmx
#define column_fidct_s column_fidct_mmx
#define row_idct_s row_idct_mmx
#define row_fdct_s row_fdct_mmx
00407 #endif // HAVE_MMX
00408 
/*
 * Filter one image plane from src into dst.
 *
 * p            filter state; p->src and p->temp are used as work buffers
 * dst, src     destination / source plane; the call is a no-op if either is NULL
 * dst_stride, src_stride  row strides of dst and src
 * width, height           plane dimensions
 * qp_store     per-block quantizer table, indexed with qp_stride per row;
 *              only read when p->qp is 0
 * qp_stride    row stride of qp_store
 * is_luma      non-zero for the luma plane; selects the work-buffer stride
 *              and the coordinate shift used for the quantizer lookup
 *
 * Outline: copy the plane into p->src with an 8-pixel mirrored border, then
 * walk it in row steps of 6-log2_count, running row_fdct -> column_fidct ->
 * row_idct over horizontal chunks and accumulating into p->temp.  Every time
 * eight output rows are complete they are written to dst through
 * store_slice_s / store_slice2_s.
 */
static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;
    const int stride= is_luma ? p->temp_stride : (width+16);
    const int step=6-p->log2_count;
    /* shift turning pixel coordinates into quantizer-table indices (4 for luma, 3 otherwise) */
    const int qps= 3 + is_luma;
    /* one aligned array split in two halves: block (row_fdct output) and block3 (column_fidct output) */
    int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
    DCTELEM *block= (DCTELEM *)block_align;
    DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);

    /* NOTE(review): the length is in bytes, so only the first 4*8*BLOCKSZ bytes
     * of block3 are cleared, not the whole half -- confirm this is intended. */
    memset(block3, 0, 4*8*BLOCKSZ);

    
    if (!src || !dst) return; 
    /* copy each row to p->src at offset (8,8) and mirror 8 pixels past the left and right edges */
    for(y=0; y<height; y++){
        int index= 8 + 8*stride + y*stride;
        fast_memcpy(p->src + index, src + y*src_stride, width);
        for(x=0; x<8; x++){
            p->src[index         - x - 1]= p->src[index +         x    ];
            p->src[index + width + x    ]= p->src[index + width - x - 1];
        }
    }
    /* mirror 8 rows above the top and below the bottom of the padded image */
    for(y=0; y<8; y++){
        fast_memcpy(p->src + (      7-y)*stride, p->src + (      y+8)*stride, stride);
        fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
    }
    

    /* clear rows 8..23 of the accumulation buffer (width samples starting at column 8) */
    for(y=8; y<24; y++)
        memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));

    for(y=step; y<height+8; y+=step){    
        /* quantizer-table row for this line: y-4 clamped to the picture, scaled by qps */
        qy=y-4;
        if (qy>height-1) qy=height-1;
        if (qy<0) qy=0;
        qy=(qy>>qps)*qp_stride;
        /* prime the first 8 columns of block; the horizontal offset alternates between 2 and 1 with y's parity */
        row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
        /* full-size chunks of 8*(BLOCKSZ-1) columns */
        for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
            row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
            if(p->qp)
                /* fixed quantizer: thresholds were prepared in vf_open(), process the chunk in one call */
                column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); 
            else
                /* per-block quantizer: look it up every 8 columns and rebuild the thresholds on change */
                for (x=0; x<8*(BLOCKSZ-1); x+=8) {
                    t=x+x0-2; 
                    if (t<0) t=0;
                    t=qp_store[qy+(t>>qps)];
                    t=norm_qscale(t, p->mpeg2);
                    if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
                    column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); 
                }
            row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
            /* carry the tail of both buffers over to the start for the next chunk */
            memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); 
            memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
        }
        
        /* remaining columns of this line (fewer than a full chunk) */
        es=width+8-x0; 
        if (es>8)
            row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
        column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
        row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
        /* when y1 reaches a non-zero multiple of 8, write the finished 8 rows to dst;
         * bit 3 of y1 picks which half of p->temp (and which store routine) is used */
        {const int y1=y-8+step;
            if (!(y1&7) && y1) {
                if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
                                        dst_stride, stride, width, 8, 5-p->log2_count);
                else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
                                    dst_stride, stride, width, 8, 5-p->log2_count);
            } }
    }

    /* write the remaining y&7 rows when the loop ended off an 8-row boundary */
    if (y&7) {  
        if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
                               dst_stride, stride, width, y&7, 5-p->log2_count);
        else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
                            dst_stride, stride, width, y&7, 5-p->log2_count);
    }
}
00488 
00489 static int config(struct vf_instance *vf,
00490                   int width, int height, int d_width, int d_height,
00491                   unsigned int flags, unsigned int outfmt)
00492 {
00493     int h= (height+16+15)&(~15);
00494 
00495     vf->priv->temp_stride= (width+16+15)&(~15);
00496     vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
00497     
00498     vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
00499 
00500     return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
00501 }
00502 
00503 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
00504 {
00505     if(mpi->flags&MP_IMGFLAG_PRESERVE) return; 
00506     
00507     vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
00508                           mpi->type, mpi->flags, mpi->width, mpi->height);
00509     mpi->planes[0]=vf->dmpi->planes[0];
00510     mpi->stride[0]=vf->dmpi->stride[0];
00511     mpi->width=vf->dmpi->width;
00512     if(mpi->flags&MP_IMGFLAG_PLANAR){
00513         mpi->planes[1]=vf->dmpi->planes[1];
00514         mpi->planes[2]=vf->dmpi->planes[2];
00515         mpi->stride[1]=vf->dmpi->stride[1];
00516         mpi->stride[2]=vf->dmpi->stride[2];
00517     }
00518     mpi->flags|=MP_IMGFLAG_DIRECT;
00519 }
00520 
00521 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
00522 {
00523     mp_image_t *dmpi;
00524     if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
00525         
00526         dmpi=vf_get_image(vf->next,mpi->imgfmt,
00527                           MP_IMGTYPE_TEMP,
00528                           MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
00529                           mpi->width,mpi->height);
00530         vf_clone_mpi_attributes(dmpi, mpi);
00531     }else{
00532         dmpi=vf->dmpi;
00533     }
00534 
00535     vf->priv->mpeg2= mpi->qscale_type;
00536     if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
00537         int w = mpi->qstride;
00538         int h = (mpi->h + 15) >> 4;
00539         if (!w) {
00540             w = (mpi->w + 15) >> 4;
00541             h = 1;
00542         }
00543         if(!vf->priv->non_b_qp)
00544             vf->priv->non_b_qp= malloc(w*h);
00545         fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
00546     }
00547     if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
00548         char *qp_tab= vf->priv->non_b_qp;
00549         if(vf->priv->bframes || !qp_tab)
00550             qp_tab= mpi->qscale;
00551 
00552         if(qp_tab || vf->priv->qp){
00553             filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
00554                    mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
00555             filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
00556                    mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00557             filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
00558                    mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00559         }else{
00560             memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
00561             memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
00562             memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
00563         }
00564     }
00565 
00566 #if HAVE_MMX
00567     if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
00568 #endif
00569 #if HAVE_MMX2
00570     if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
00571 #endif
00572     return vf_next_put_image(vf,dmpi, pts);
00573 }
00574 
00575 static void uninit(struct vf_instance *vf)
00576 {
00577     if(!vf->priv) return;
00578 
00579     av_free(vf->priv->temp);
00580     vf->priv->temp= NULL;
00581     av_free(vf->priv->src);
00582     vf->priv->src= NULL;
00583     
00584     
00585     free(vf->priv->non_b_qp);
00586     vf->priv->non_b_qp= NULL;
00587 
00588     av_free(vf->priv);
00589     vf->priv=NULL;
00590 }
00591 
00592 
00593 
00594 static int query_format(struct vf_instance *vf, unsigned int fmt)
00595 {
00596     switch(fmt){
00597     case IMGFMT_YVU9:
00598     case IMGFMT_IF09:
00599     case IMGFMT_YV12:
00600     case IMGFMT_I420:
00601     case IMGFMT_IYUV:
00602     case IMGFMT_CLPL:
00603     case IMGFMT_Y800:
00604     case IMGFMT_Y8:
00605     case IMGFMT_444P:
00606     case IMGFMT_422P:
00607     case IMGFMT_411P:
00608         return vf_next_query_format(vf,fmt);
00609     }
00610     return 0;
00611 }
00612 
00613 static int control(struct vf_instance *vf, int request, void* data)
00614 {
00615     switch(request){
00616     case VFCTRL_QUERY_MAX_PP_LEVEL:
00617         return 5;
00618     case VFCTRL_SET_PP_LEVEL:
00619         vf->priv->log2_count= *((unsigned int*)data);
00620         if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
00621         return CONTROL_TRUE;
00622     }
00623     return vf_next_control(vf,request,data);
00624 }
00625 
00626 static int vf_open(vf_instance_t *vf, char *args)
00627 {
00628     int i=0, bias;
00629     int custom_threshold_m[64];
00630     int log2c=-1;
00631 
00632     vf->config=config;
00633     vf->put_image=put_image;
00634     vf->get_image=get_image;
00635     vf->query_format=query_format;
00636     vf->uninit=uninit;
00637     vf->control= control;
00638     vf->priv=av_mallocz(sizeof(struct vf_priv_s));
00639 
00640     init_avcodec();
00641 
00642     
00643     
00644 
00645     vf->priv->log2_count= 4;
00646     vf->priv->bframes = 0;
00647 
00648     if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
00649 
00650     if( log2c >=4 && log2c <=5 )
00651         vf->priv->log2_count = log2c;
00652     else if( log2c >= 6 )
00653         vf->priv->log2_count = 5;
00654 
00655     if(vf->priv->qp < 0)
00656         vf->priv->qp = 0;
00657 
00658     if (i < -15) i = -15;
00659     if (i > 32) i = 32;
00660 
00661     bias= (1<<4)+i; 
00662     vf->priv->prev_q=0;
00663     
00664     for(i=0;i<64;i++) 
00665         custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
00666     for(i=0;i<8;i++){
00667         vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
00668             |(((uint64_t)custom_threshold_m[i*8+6])<<16)
00669             |(((uint64_t)custom_threshold_m[i*8+0])<<32)
00670             |(((uint64_t)custom_threshold_m[i*8+4])<<48);
00671         vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
00672             |(((uint64_t)custom_threshold_m[i*8+3])<<16)
00673             |(((uint64_t)custom_threshold_m[i*8+1])<<32)
00674             |(((uint64_t)custom_threshold_m[i*8+7])<<48);
00675     }
00676 
00677     if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
00678 
00679     return 1;
00680 }
00681 
/*
 * Registration record for this filter.  The initialisers are positional:
 * a description string, the short name "fspp", the authors, an empty string,
 * the open function, and a trailing NULL.
 * NOTE(review): the meaning of the empty string and the NULL depends on the
 * vf_info_t declaration in vf.h, which is not visible here.
 */
const vf_info_t vf_info_fspp = {
    "fast simple postprocess",
    "fspp",
    "Michael Niedermayer, Nikolaj Poroshin",
    "",
    vf_open,
    NULL
};
00690 
00691 
00692 
00693 
00694 
00695 
00696 
00697 
00698 
/* Transform size, as a number and as a string for pasting into asm operands. */
#define DCTSIZE 8
#define DCTSIZE_S "8"

/*
 * FIX(x,s): x as a fixed-point value with s fractional bits, rounded by adding
 * 0.5 and truncating, then reduced to its low 16 bits (so negative constants
 * appear in two's-complement form).
 */
#define FIX(x,s)  (((int) ((x) * (1<<(s)) + 0.5))&0xffff)
/*
 * C64(x): the 16-bit value x replicated into all four words of a uint64_t.
 * x must be in 0..0xffff.  Every shift is done on a uint64_t so values with
 * bit 15 set no longer overflow a signed int, and the whole expansion is
 * parenthesised so it can be used inside larger expressions.
 */
#define C64(x)    (((uint64_t)(x)<<48) | ((uint64_t)(x)<<32) | ((uint64_t)(x)<<16) | (uint64_t)(x))
#define FIX64(x,s)  C64(FIX(x,s))

/* High part of a product against a 16-bit fixed-point constant. */
#define MULTIPLY16H(x,k)   (((x)*(k))>>16)
/*
 * THRESHOLD(r,x,t): r = x when x lies outside [-t, t], otherwise r = 0.
 * The unsigned compare tests both bounds at once.  Wrapped in do/while(0)
 * so it behaves as one statement (safe inside an unbraced if/else).
 */
#define THRESHOLD(r,x,t) do { if(((unsigned)((x)+(t)))>(t)*2) (r)=(x); else (r)=0; } while (0)
/* Divide by 2^n, rounding to nearest. */
#define DESCALE(x,n)  (((x) + (1 << ((n)-1))) >> (n))
00709 
#if HAVE_MMX

/*
 * MMX build: each constant is a 64-bit value made by FIX64()/C64(), i.e. the
 * same 16-bit fixed-point number repeated in all four words.  The second
 * FIX64 argument is the number of fractional bits.
 */
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);

DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);

DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
/* note: despite the name this one stores the negated value */
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); 
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);

DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);

/* the small integers 4 and 2 replicated into all four words */
DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);

#else 

/*
 * C build: the same fixed-point constants as plain 16-bit values, plus the
 * 32-bit working type used for intermediates in column_fidct_c().
 */
typedef int32_t int_simd16_t;
static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
/* note: despite the name this one stores the negated value */
static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); 
static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
static const int16_t FIX_1_082392200=FIX(1.082392200, 13);

#endif
00745 
00746 #if !HAVE_MMX
00747 
/*
 * Column pass (C path): for each column, run an 8-point forward transform,
 * zero every coefficient whose magnitude does not exceed its threshold, run
 * the matching inverse transform and add the result into the output buffer.
 *
 * thr_adr  threshold table; entry [k*8 + column] is used for coefficient k
 * data     input, read as 8 values per column at a stride of DCTSIZE
 * output   result buffer, same layout; rows 0..5 are accumulated (+=) while
 *          rows 6 and 7 are overwritten (=)
 * cnt      amount of work; the outer loop handles 8 columns per iteration
 *          and decrements cnt by 2 each time
 *
 * Both pointers advance by 16 elements per outer iteration (8 in the inner
 * loop plus 8 after it) while each iteration touches 64 elements, so
 * successive iterations overlap in data and output.
 * NOTE(review): the "=" on rows 6 and 7 presumably initialises the rows that
 * enter the overlapping window for the first time -- confirm against
 * row_idct_c().
 */
static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    DCTELEM* dataptr;
    DCTELEM* wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt-=2) { 
        /* the threshold table restarts for every group of 8 columns */
        threshold=(int16_t*)thr_adr;
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            /* first butterfly stage: sums and differences of mirrored inputs */
            tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
            tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];

            tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
            tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];

            tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
            tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];

            tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
            tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

            /* forward transform, even part: produces d0, d2, d4, d6 */

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            /* threshold the even coefficients, then inverse transform, even part */

            THRESHOLD(tmp0, d0, threshold[0*8]);
            THRESHOLD(tmp1, d2, threshold[2*8]);
            THRESHOLD(tmp2, d4, threshold[4*8]);
            THRESHOLD(tmp3, d6, threshold[6*8]);
            /* rounding offset for the >>2 below */
            tmp0+=2;
            tmp10 = (tmp0 + tmp2)>>2;
            tmp11 = (tmp0 - tmp2)>>2;

            tmp13 = (tmp1 + tmp3)>>2; 
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; 

            tmp0 = tmp10 + tmp13; 
            tmp3 = tmp10 - tmp13; 
            tmp1 = tmp11 + tmp12; 
            tmp2 = tmp11 - tmp12; 

            /* forward transform, odd part: produces d1, d3, d5, d7 */

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            /* threshold the odd coefficients */

            THRESHOLD(tmp4, d1, threshold[1*8]);
            THRESHOLD(tmp5, d3, threshold[3*8]);
            THRESHOLD(tmp6, d5, threshold[5*8]);
            THRESHOLD(tmp7, d7, threshold[7*8]);

            /* inverse transform, odd part */
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5)<<1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7)<<1;

            tmp7 = (z11 + z13)>>2; 
            tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
            z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
            /* FIX_2_613125930 already holds the negated constant */
            tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; 

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            /* final butterfly: rows 0..5 accumulate, rows 6 and 7 are assigned */
            wsptr[DCTSIZE*0]+=  (tmp0 + tmp7);
            wsptr[DCTSIZE*1]+=  (tmp1 + tmp6);
            wsptr[DCTSIZE*2]+=  (tmp2 + tmp5);
            wsptr[DCTSIZE*3]+=  (tmp3 - tmp4);
            wsptr[DCTSIZE*4]+=  (tmp3 + tmp4);
            wsptr[DCTSIZE*5]+=  (tmp2 - tmp5);
            wsptr[DCTSIZE*6]=  (tmp1 - tmp6);
            wsptr[DCTSIZE*7]=  (tmp0 - tmp7);
            
            /* next column */
            dataptr++; 
            wsptr++;
            threshold++;
        }
        /* skip ahead so the next group starts 16 elements after this one */
        dataptr+=8; 
        wsptr  +=8;
    }
}
00870 
00871 #else 
00872 
// MMX inline-assembly counterpart of the C column pass located above the
// #else: a transform down each column of data, a per-coefficient threshold
// step against thr_adr, and the reverse transform whose result is combined
// into output.  The tmpN / dN / zN names used in the comments below follow
// that C code; rK means row K of data.
//
//   thr_adr  threshold table (REG_d); the entry for coefficient k is read at
//            k*16 for the first four columns and at 1*8+k*16 for the next four.
//   data     input block (REG_S); row k is read at DCTSIZE_S*k*2.
//   output   accumulator (REG_D), same row addressing; rows 0..5 are added
//            to, rows 6 and 7 are overwritten.
//   cnt      loop counter (REG_c); every pass of the loop handles eight
//            columns and subtracts 2, and the loop only ends when the counter
//            reaches exactly zero, so a positive even value is required.
//
// NOTE(review): the clobber list names only REG_a; neither a memory clobber
// nor the MMX registers are declared although the asm stores through REG_D.
// Confirm this is acceptable for the compilers in use.
00873 static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt)
00874 {
00875     uint64_t __attribute__((aligned(8))) temps[4]; // scratch (operand %3): holds tmp0..tmp3 of the even part
00876     __asm__ volatile(
00877         ASMALIGN(4)
00878         "1:                   \n\t" // loop head; first group of four columns
00879         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" // r0
00880         
00881         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" // r3
00882         "movq %%mm1, %%mm0             \n\t" // keep r0
00883 
00884         "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" // r0 + r7
00885         "movq %%mm7, %%mm3             \n\t" // keep r3
00886 
00887         "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" // r3 + r4
00888         "movq %%mm1, %%mm5             \n\t"
00889 
00890         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" // r1
00891         "psubw %%mm7, %%mm1            \n\t" // tmp13
00892 
00893         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" // r2
00894         "movq %%mm6, %%mm4             \n\t" // keep r1
00895 
00896         "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" // r1 + r6
00897         "paddw %%mm7, %%mm5            \n\t" // tmp10
00898 
00899         "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" // r2 + r5
00900         "movq %%mm6, %%mm7             \n\t"
00901 
00902         "paddw %%mm2, %%mm6            \n\t" // tmp11
00903         "psubw %%mm2, %%mm7            \n\t" // tmp12
00904 
00905         "movq %%mm5, %%mm2             \n\t"
00906         "paddw %%mm6, %%mm5            \n\t" // d0
00907         
00908         "psubw %%mm6, %%mm2            \n\t" // d4
00909         "paddw %%mm1, %%mm7            \n\t" // tmp12 + tmp13
00910 
00911         "movq  4*16(%%"REG_d"), %%mm6      \n\t" // threshold for coefficient 4
00912         "psllw $2, %%mm7              \n\t"
00913 
00914         "psubw 0*16(%%"REG_d"), %%mm5      \n\t" // threshold d0 (entry 0) and d4 (mm6): psubw, paddusw,
00915         "psubw %%mm6, %%mm2            \n\t" // paddw, psubusw -- presumably the SIMD form of THRESHOLD()
00916 
00917         "paddusw 0*16(%%"REG_d"), %%mm5    \n\t"
00918         "paddusw %%mm6, %%mm2          \n\t"
00919 
00920         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t" // z1
00921         
00922         "paddw 0*16(%%"REG_d"), %%mm5      \n\t"
00923         "paddw %%mm6, %%mm2            \n\t"
00924 
00925         "psubusw 0*16(%%"REG_d"), %%mm5    \n\t" // mm5 = thresholded d0 (tmp0)
00926         "psubusw %%mm6, %%mm2          \n\t" // mm2 = thresholded d4 (tmp2)
00927 
00928 
00929 
00930 
00931         "paddw "MANGLE(MM_2)", %%mm5            \n\t" // tmp0 += 2
00932         "movq %%mm2, %%mm6             \n\t"
00933 
00934         "paddw %%mm5, %%mm2            \n\t" // tmp0 + tmp2
00935         "psubw %%mm6, %%mm5            \n\t" // tmp0 - tmp2
00936 
00937         "movq %%mm1, %%mm6             \n\t"
00938         "paddw %%mm7, %%mm1            \n\t" // d2
00939 
00940         "psubw 2*16(%%"REG_d"), %%mm1      \n\t" // threshold d2 against entry 2 (continues below)
00941         "psubw %%mm7, %%mm6            \n\t" // d6
00942 
00943         "movq 6*16(%%"REG_d"), %%mm7       \n\t" // threshold for coefficient 6
00944         "psraw $2, %%mm5              \n\t" // tmp11 = (tmp0 - tmp2) >> 2
00945 
00946         "paddusw 2*16(%%"REG_d"), %%mm1    \n\t"
00947         "psubw %%mm7, %%mm6            \n\t" // threshold d6 against mm7 (continues below)
00948         
00949 
00950         "paddw 2*16(%%"REG_d"), %%mm1      \n\t"
00951         "paddusw %%mm7, %%mm6          \n\t"
00952 
00953         "psubusw 2*16(%%"REG_d"), %%mm1    \n\t" // mm1 = thresholded d2 (tmp1)
00954         "paddw %%mm7, %%mm6            \n\t"
00955 
00956         "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" // r3 - r4
00957         "psubusw %%mm7, %%mm6          \n\t" // mm6 = thresholded d6 (tmp3)
00958 
00959         
00960         
00961         "movq %%mm1, %%mm7             \n\t"
00962         "psraw $2, %%mm2              \n\t" // tmp10 = (tmp0 + tmp2) >> 2
00963 
00964         "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" // r1 - r6
00965         "psubw %%mm6, %%mm1            \n\t" // tmp1 - tmp3
00966 
00967         "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" // r0 - r7
00968         "paddw %%mm7, %%mm6            \n\t" // tmp1 + tmp3
00969 
00970         "psraw $2, %%mm6              \n\t" // tmp13 = (tmp1 + tmp3) >> 2
00971         "movq %%mm2, %%mm7             \n\t"
00972 
00973         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
00974         "paddw %%mm6, %%mm2            \n\t" // new tmp0 = tmp10 + tmp13
00975 
00976         "movq %%mm2, 0*8+%3            \n\t" // temps[0] = tmp0
00977         "psubw %%mm6, %%mm7            \n\t" // new tmp3 = tmp10 - tmp13
00978 
00979         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" // r2
00980         "psubw %%mm6, %%mm1            \n\t" // tmp12 = product - tmp13
00981 
00982         "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" // r2 - r5
00983         "movq %%mm5, %%mm6             \n\t"
00984 
00985         "movq %%mm7, 3*8+%3            \n\t" // temps[3] = tmp3
00986         "paddw %%mm2, %%mm3            \n\t" // odd part tmp10 = (r3 - r4) + (r2 - r5)
00987 
00988         "paddw %%mm4, %%mm2            \n\t" // odd part tmp11 = (r2 - r5) + (r1 - r6)
00989         "paddw %%mm0, %%mm4            \n\t" // odd part tmp12 = (r1 - r6) + (r0 - r7)
00990 
00991         "movq %%mm3, %%mm7             \n\t"
00992         "psubw %%mm4, %%mm3            \n\t" // tmp10 - tmp12
00993 
00994         "psllw $2, %%mm3              \n\t"
00995         "psllw $2, %%mm7              \n\t" 
00996 
00997         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" // z5
00998         "psllw $2, %%mm4              \n\t"
00999 
01000         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01001         "psllw $2, %%mm2              \n\t"
01002 
01003         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01004         "paddw %%mm1, %%mm5            \n\t" // new tmp1 = tmp11 + tmp12
01005 
01006         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t" // z3
01007         "psubw %%mm1, %%mm6            \n\t" // new tmp2 = tmp11 - tmp12
01008         
01009 
01010         "paddw %%mm3, %%mm7            \n\t" // z2
01011 
01012         "movq %%mm5, 1*8+%3            \n\t" // temps[1] = tmp1
01013         "paddw %%mm3, %%mm4            \n\t" // z4
01014 
01015         "movq 3*16(%%"REG_d"), %%mm3       \n\t" // threshold for coefficient 3
01016         "movq %%mm0, %%mm1             \n\t"
01017 
01018         "movq %%mm6, 2*8+%3            \n\t" // temps[2] = tmp2
01019         "psubw %%mm2, %%mm1            \n\t" // z13
01020 
01021 
01022         "paddw %%mm2, %%mm0            \n\t" // z11
01023         "movq %%mm1, %%mm5             \n\t"
01024 
01025         "movq 5*16(%%"REG_d"), %%mm2       \n\t" // threshold for coefficient 5
01026         "psubw %%mm7, %%mm1            \n\t" // d3
01027 
01028         "paddw %%mm7, %%mm5            \n\t" // d5
01029         "psubw %%mm3, %%mm1            \n\t" // threshold d3 against mm3 (continues below)
01030 
01031         "movq 1*16(%%"REG_d"), %%mm7       \n\t" // threshold for coefficient 1
01032         "psubw %%mm2, %%mm5            \n\t" // threshold d5 against mm2 (continues below)
01033 
01034         "movq %%mm0, %%mm6             \n\t"
01035         "paddw %%mm4, %%mm0            \n\t" // d1
01036 
01037         "paddusw %%mm3, %%mm1          \n\t"
01038         "psubw %%mm4, %%mm6            \n\t" // d7
01039 
01040         
01041         "movq 7*16(%%"REG_d"), %%mm4       \n\t" // threshold for coefficient 7
01042         "psubw %%mm7, %%mm0            \n\t" // threshold d1 against mm7 (continues below)
01043 
01044         "psubw %%mm4, %%mm6            \n\t" // threshold d7 against mm4 (continues below)
01045         "paddusw %%mm2, %%mm5          \n\t"
01046 
01047         "paddusw %%mm4, %%mm6          \n\t"
01048         "paddw %%mm3, %%mm1            \n\t"
01049 
01050         "paddw %%mm2, %%mm5            \n\t"
01051         "paddw %%mm4, %%mm6            \n\t"
01052 
01053         "psubusw %%mm3, %%mm1          \n\t" // mm1 = thresholded d3
01054         "psubusw %%mm2, %%mm5          \n\t" // mm5 = thresholded d5
01055 
01056         "psubusw %%mm4, %%mm6          \n\t" // mm6 = thresholded d7
01057         "movq %%mm1, %%mm4             \n\t"
01058 
01059         "por %%mm5, %%mm4              \n\t" // OR the thresholded d3, d5 and d7 together
01060         "paddusw %%mm7, %%mm0          \n\t"
01061 
01062         "por %%mm6, %%mm4              \n\t"
01063         "paddw %%mm7, %%mm0            \n\t"
01064 
01065         "packssdw %%mm4, %%mm4         \n\t"
01066         "psubusw %%mm7, %%mm0          \n\t" // mm0 = thresholded d1
01067 
01068         "movd %%mm4, %%"REG_a"             \n\t"
01069         "or %%"REG_a", %%"REG_a"              \n\t"
01070         "jnz 2f                 \n\t" // any of d3/d5/d7 non-zero: take the general path at 2
01071         
01072         
01073         
01074         
01075         
01076         
01077 
01078         "movq 0*8+%3, %%mm4            \n\t" // shortcut path: only d1 contributes to the odd part
01079         "movq %%mm0, %%mm1             \n\t"
01080 
01081         "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" // plays the role of tmp6
01082         "movq %%mm1, %%mm2             \n\t"
01083 
01084         "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01085         "movq %%mm2, %%mm3             \n\t"
01086 
01087         "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" // plays the role of tmp5
01088         "paddw %%mm4, %%mm5            \n\t"
01089 
01090         "movq 1*8+%3, %%mm6            \n\t"
01091         
01092         "psraw $2, %%mm3              \n\t" // plays the role of tmp7
01093 
01094         "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" // plays the role of -tmp4
01095         "psubw %%mm3, %%mm4            \n\t"
01096 
01097         "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01098         "paddw %%mm3, %%mm5            \n\t"
01099 
01100         "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" // row 7 = tmp0 - tmp7 (stored, not accumulated)
01101         "paddw %%mm6, %%mm7            \n\t"
01102 
01103         "movq 2*8+%3, %%mm3            \n\t"
01104         "psubw %%mm0, %%mm6            \n\t"
01105 
01106         "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01107         "paddw %%mm0, %%mm7            \n\t"
01108 
01109         "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" // row 0 += tmp0 + tmp7
01110         "paddw %%mm3, %%mm4            \n\t"
01111 
01112         "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" // row 6 = tmp1 - tmp6 (stored, not accumulated)
01113         "psubw %%mm1, %%mm3            \n\t"
01114 
01115         "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01116         "paddw %%mm1, %%mm4            \n\t"
01117 
01118         "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01119         "paddw %%mm3, %%mm5            \n\t"
01120 
01121         "movq 3*8+%3, %%mm0            \n\t"
01122         "add $8, %%"REG_S"               \n\t" // data: step to the next four columns
01123 
01124         "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" // row 1 += tmp1 + tmp6
01125         "paddw %%mm0, %%mm6            \n\t"
01126 
01127         "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" // row 2 += tmp2 + tmp5
01128         "psubw %%mm2, %%mm0            \n\t"
01129 
01130         "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01131         "paddw %%mm2, %%mm6            \n\t"
01132 
01133         "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" // row 5 += tmp2 - tmp5
01134         "paddw %%mm0, %%mm7            \n\t"
01135 
01136         "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" // row 3 += tmp3 - tmp4
01137 
01138         "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" // row 4 += tmp3 + tmp4
01139         "add $8, %%"REG_D"               \n\t" // output: step to the next four columns
01140         "jmp 4f                  \n\t"
01141 
01142         "2:                    \n\t" // general path: mm0 = d1, mm1 = d3, mm5 = d5, mm6 = d7 (all thresholded)
01143         
01144         
01145         
01146         
01147         
01148         "movq %%mm5, %%mm3             \n\t"
01149         "psubw %%mm1, %%mm5            \n\t"
01150 
01151         "psllw $1, %%mm5              \n\t" // z10
01152         "paddw %%mm1, %%mm3            \n\t" // z13
01153 
01154         "movq %%mm0, %%mm2             \n\t"
01155         "psubw %%mm6, %%mm0            \n\t"
01156 
01157         "movq %%mm5, %%mm1             \n\t"
01158         "psllw $1, %%mm0              \n\t" // z12
01159 
01160         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" 
01161         "paddw %%mm0, %%mm5            \n\t" // z10 + z12
01162 
01163         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" // z5
01164         "paddw %%mm6, %%mm2            \n\t" // z11
01165 
01166         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01167         "movq %%mm2, %%mm7             \n\t"
01168 
01169         
01170         "movq 0*8+%3, %%mm4            \n\t" // reload tmp0
01171         "psubw %%mm3, %%mm2            \n\t" // z11 - z13
01172 
01173         "psllw $1, %%mm2              \n\t"
01174         "paddw %%mm3, %%mm7            \n\t" // z11 + z13
01175 
01176         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" // tmp11
01177         "movq %%mm4, %%mm6             \n\t"
01178         
01179         "psraw $2, %%mm7              \n\t" // tmp7
01180 
01181         "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01182         "psubw %%mm7, %%mm6            \n\t"
01183 
01184         "movq 1*8+%3, %%mm3            \n\t" // reload tmp1
01185         "paddw %%mm7, %%mm4            \n\t"
01186 
01187         "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" // row 7 = tmp0 - tmp7 (stored, not accumulated)
01188         "paddw %%mm5, %%mm1            \n\t" // tmp12
01189 
01190         "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" // row 0 += tmp0 + tmp7
01191         "psubw %%mm7, %%mm1            \n\t" // tmp6
01192 
01193         "movq 2*8+%3, %%mm7            \n\t" // reload tmp2
01194         "psubw %%mm5, %%mm0            \n\t" // tmp10
01195 
01196         "movq 3*8+%3, %%mm6            \n\t" // reload tmp3
01197         "movq %%mm3, %%mm5             \n\t"
01198 
01199         "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01200         "psubw %%mm1, %%mm5            \n\t"
01201 
01202         "psubw %%mm1, %%mm2            \n\t" // tmp5
01203         "paddw %%mm1, %%mm3            \n\t"
01204 
01205         "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" // row 6 = tmp1 - tmp6 (stored, not accumulated)
01206         "movq %%mm7, %%mm4             \n\t"
01207 
01208         "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01209         "psubw %%mm2, %%mm4            \n\t"
01210 
01211         "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01212         "paddw %%mm2, %%mm7            \n\t"
01213 
01214         "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" // row 1 += tmp1 + tmp6
01215         "paddw %%mm2, %%mm0            \n\t" // tmp4
01216 
01217         
01218         "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" // row 2 += tmp2 + tmp5
01219         "movq %%mm6, %%mm1             \n\t"
01220 
01221         "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01222         "psubw %%mm0, %%mm1            \n\t"
01223 
01224         "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01225         "paddw %%mm0, %%mm6            \n\t"
01226 
01227         "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" // row 5 += tmp2 - tmp5
01228         "add $8, %%"REG_S"               \n\t" // data: step to the next four columns
01229 
01230         "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" // row 4 += tmp3 + tmp4
01231 
01232         "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" // row 3 += tmp3 - tmp4
01233         "add $8, %%"REG_D"               \n\t" // output: step to the next four columns
01234 
01235         "4:                     \n\t" // second group of four columns: same sequence, thresholds at 1*8+k*16
01236 
01237         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" // r0
01238         
01239         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" // r3
01240         "movq %%mm1, %%mm0             \n\t"
01241 
01242         "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" // r0 + r7
01243         "movq %%mm7, %%mm3             \n\t"
01244 
01245         "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" // r3 + r4
01246         "movq %%mm1, %%mm5             \n\t"
01247 
01248         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" // r1
01249         "psubw %%mm7, %%mm1            \n\t" // tmp13
01250 
01251         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" // r2
01252         "movq %%mm6, %%mm4             \n\t"
01253 
01254         "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" // r1 + r6
01255         "paddw %%mm7, %%mm5            \n\t" // tmp10
01256 
01257         "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" // r2 + r5
01258         "movq %%mm6, %%mm7             \n\t"
01259 
01260         "paddw %%mm2, %%mm6            \n\t" // tmp11
01261         "psubw %%mm2, %%mm7            \n\t" // tmp12
01262 
01263         "movq %%mm5, %%mm2             \n\t"
01264         "paddw %%mm6, %%mm5            \n\t" // d0
01265         
01266         "psubw %%mm6, %%mm2            \n\t" // d4
01267         "paddw %%mm1, %%mm7            \n\t" // tmp12 + tmp13
01268 
01269         "movq  1*8+4*16(%%"REG_d"), %%mm6  \n\t" // threshold for coefficient 4
01270         "psllw $2, %%mm7              \n\t"
01271 
01272         "psubw 1*8+0*16(%%"REG_d"), %%mm5  \n\t" // threshold d0 and d4 (four-instruction sequence)
01273         "psubw %%mm6, %%mm2            \n\t"
01274 
01275         "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01276         "paddusw %%mm6, %%mm2          \n\t"
01277 
01278         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t" // z1
01279         
01280         "paddw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
01281         "paddw %%mm6, %%mm2            \n\t"
01282 
01283         "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" // mm5 = thresholded d0 (tmp0)
01284         "psubusw %%mm6, %%mm2          \n\t" // mm2 = thresholded d4 (tmp2)
01285 
01286 
01287 
01288 
01289         "paddw "MANGLE(MM_2)", %%mm5            \n\t" // tmp0 += 2
01290         "movq %%mm2, %%mm6             \n\t"
01291 
01292         "paddw %%mm5, %%mm2            \n\t" // tmp0 + tmp2
01293         "psubw %%mm6, %%mm5            \n\t" // tmp0 - tmp2
01294 
01295         "movq %%mm1, %%mm6             \n\t"
01296         "paddw %%mm7, %%mm1            \n\t" // d2
01297 
01298         "psubw 1*8+2*16(%%"REG_d"), %%mm1  \n\t" // threshold d2 (continues below)
01299         "psubw %%mm7, %%mm6            \n\t" // d6
01300 
01301         "movq 1*8+6*16(%%"REG_d"), %%mm7   \n\t" // threshold for coefficient 6
01302         "psraw $2, %%mm5              \n\t" // tmp11 = (tmp0 - tmp2) >> 2
01303 
01304         "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01305         "psubw %%mm7, %%mm6            \n\t" // threshold d6 (continues below)
01306         
01307 
01308         "paddw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
01309         "paddusw %%mm7, %%mm6          \n\t"
01310 
01311         "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" // mm1 = thresholded d2 (tmp1)
01312         "paddw %%mm7, %%mm6            \n\t"
01313 
01314         "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" // r3 - r4
01315         "psubusw %%mm7, %%mm6          \n\t" // mm6 = thresholded d6 (tmp3)
01316 
01317         
01318         
01319         "movq %%mm1, %%mm7             \n\t"
01320         "psraw $2, %%mm2              \n\t" // tmp10 = (tmp0 + tmp2) >> 2
01321 
01322         "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" // r1 - r6
01323         "psubw %%mm6, %%mm1            \n\t" // tmp1 - tmp3
01324 
01325         "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" // r0 - r7
01326         "paddw %%mm7, %%mm6            \n\t" // tmp1 + tmp3
01327 
01328         "psraw $2, %%mm6              \n\t" // tmp13 = (tmp1 + tmp3) >> 2
01329         "movq %%mm2, %%mm7             \n\t"
01330 
01331         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
01332         "paddw %%mm6, %%mm2            \n\t" // new tmp0 = tmp10 + tmp13
01333 
01334         "movq %%mm2, 0*8+%3            \n\t" // temps[0] = tmp0
01335         "psubw %%mm6, %%mm7            \n\t" // new tmp3 = tmp10 - tmp13
01336 
01337         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" // r2
01338         "psubw %%mm6, %%mm1            \n\t" // tmp12 = product - tmp13
01339 
01340         "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" // r2 - r5
01341         "movq %%mm5, %%mm6             \n\t"
01342 
01343         "movq %%mm7, 3*8+%3            \n\t" // temps[3] = tmp3
01344         "paddw %%mm2, %%mm3            \n\t" // odd part tmp10
01345 
01346         "paddw %%mm4, %%mm2            \n\t" // odd part tmp11
01347         "paddw %%mm0, %%mm4            \n\t" // odd part tmp12
01348 
01349         "movq %%mm3, %%mm7             \n\t"
01350         "psubw %%mm4, %%mm3            \n\t" // tmp10 - tmp12
01351 
01352         "psllw $2, %%mm3              \n\t"
01353         "psllw $2, %%mm7              \n\t" 
01354 
01355         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" // z5
01356         "psllw $2, %%mm4              \n\t"
01357 
01358         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01359         "psllw $2, %%mm2              \n\t"
01360 
01361         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01362         "paddw %%mm1, %%mm5            \n\t" // new tmp1 = tmp11 + tmp12
01363 
01364         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t" // z3
01365         "psubw %%mm1, %%mm6            \n\t" // new tmp2 = tmp11 - tmp12
01366         
01367 
01368         "paddw %%mm3, %%mm7            \n\t" // z2
01369 
01370         "movq %%mm5, 1*8+%3            \n\t" // temps[1] = tmp1
01371         "paddw %%mm3, %%mm4            \n\t" // z4
01372 
01373         "movq 1*8+3*16(%%"REG_d"), %%mm3   \n\t" // threshold for coefficient 3
01374         "movq %%mm0, %%mm1             \n\t"
01375 
01376         "movq %%mm6, 2*8+%3            \n\t" // temps[2] = tmp2
01377         "psubw %%mm2, %%mm1            \n\t" // z13
01378 
01379 
01380         "paddw %%mm2, %%mm0            \n\t" // z11
01381         "movq %%mm1, %%mm5             \n\t"
01382 
01383         "movq 1*8+5*16(%%"REG_d"), %%mm2   \n\t" // threshold for coefficient 5
01384         "psubw %%mm7, %%mm1            \n\t" // d3
01385 
01386         "paddw %%mm7, %%mm5            \n\t" // d5
01387         "psubw %%mm3, %%mm1            \n\t" // threshold d3 (continues below)
01388 
01389         "movq 1*8+1*16(%%"REG_d"), %%mm7   \n\t" // threshold for coefficient 1
01390         "psubw %%mm2, %%mm5            \n\t" // threshold d5 (continues below)
01391 
01392         "movq %%mm0, %%mm6             \n\t"
01393         "paddw %%mm4, %%mm0            \n\t" // d1
01394 
01395         "paddusw %%mm3, %%mm1          \n\t"
01396         "psubw %%mm4, %%mm6            \n\t" // d7
01397 
01398         
01399         "movq 1*8+7*16(%%"REG_d"), %%mm4   \n\t" // threshold for coefficient 7
01400         "psubw %%mm7, %%mm0            \n\t" // threshold d1 (continues below)
01401 
01402         "psubw %%mm4, %%mm6            \n\t" // threshold d7 (continues below)
01403         "paddusw %%mm2, %%mm5          \n\t"
01404 
01405         "paddusw %%mm4, %%mm6          \n\t"
01406         "paddw %%mm3, %%mm1            \n\t"
01407 
01408         "paddw %%mm2, %%mm5            \n\t"
01409         "paddw %%mm4, %%mm6            \n\t"
01410 
01411         "psubusw %%mm3, %%mm1          \n\t" // mm1 = thresholded d3
01412         "psubusw %%mm2, %%mm5          \n\t" // mm5 = thresholded d5
01413 
01414         "psubusw %%mm4, %%mm6          \n\t" // mm6 = thresholded d7
01415         "movq %%mm1, %%mm4             \n\t"
01416 
01417         "por %%mm5, %%mm4              \n\t" // OR the thresholded d3, d5 and d7 together
01418         "paddusw %%mm7, %%mm0          \n\t"
01419 
01420         "por %%mm6, %%mm4              \n\t"
01421         "paddw %%mm7, %%mm0            \n\t"
01422 
01423         "packssdw %%mm4, %%mm4         \n\t"
01424         "psubusw %%mm7, %%mm0          \n\t" // mm0 = thresholded d1
01425 
01426         "movd %%mm4, %%"REG_a"             \n\t"
01427         "or %%"REG_a", %%"REG_a"              \n\t"
01428         "jnz 3f                 \n\t" // any of d3/d5/d7 non-zero: take the general path at 3
01429         
01430         
01431         
01432         
01433         
01434         
01435 
01436         "movq 0*8+%3, %%mm4            \n\t" // shortcut path: only d1 contributes to the odd part
01437         "movq %%mm0, %%mm1             \n\t"
01438 
01439         "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" // plays the role of tmp6
01440         "movq %%mm1, %%mm2             \n\t"
01441 
01442         "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01443         "movq %%mm2, %%mm3             \n\t"
01444 
01445         "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" // plays the role of tmp5
01446         "paddw %%mm4, %%mm5            \n\t"
01447 
01448         "movq 1*8+%3, %%mm6            \n\t"
01449         
01450         "psraw $2, %%mm3              \n\t" // plays the role of tmp7
01451 
01452         "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" // plays the role of -tmp4
01453         "psubw %%mm3, %%mm4            \n\t"
01454 
01455         "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01456         "paddw %%mm3, %%mm5            \n\t"
01457 
01458         "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" // row 7 = tmp0 - tmp7 (stored, not accumulated)
01459         "paddw %%mm6, %%mm7            \n\t"
01460 
01461         "movq 2*8+%3, %%mm3            \n\t"
01462         "psubw %%mm0, %%mm6            \n\t"
01463 
01464         "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01465         "paddw %%mm0, %%mm7            \n\t"
01466 
01467         "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" // row 0 += tmp0 + tmp7
01468         "paddw %%mm3, %%mm4            \n\t"
01469 
01470         "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" // row 6 = tmp1 - tmp6 (stored, not accumulated)
01471         "psubw %%mm1, %%mm3            \n\t"
01472 
01473         "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01474         "paddw %%mm1, %%mm4            \n\t"
01475 
01476         "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01477         "paddw %%mm3, %%mm5            \n\t"
01478 
01479         "movq 3*8+%3, %%mm0            \n\t"
01480         "add $24, %%"REG_S"              \n\t" // data: 8 + 24 = 32 bytes advanced per loop pass
01481 
01482         "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" // row 1 += tmp1 + tmp6
01483         "paddw %%mm0, %%mm6            \n\t"
01484 
01485         "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" // row 2 += tmp2 + tmp5
01486         "psubw %%mm2, %%mm0            \n\t"
01487 
01488         "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01489         "paddw %%mm2, %%mm6            \n\t"
01490 
01491         "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" // row 5 += tmp2 - tmp5
01492         "paddw %%mm0, %%mm7            \n\t"
01493 
01494         "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" // row 3 += tmp3 - tmp4
01495 
01496         "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" // row 4 += tmp3 + tmp4
01497         "add $24, %%"REG_D"              \n\t" // output: 8 + 24 = 32 bytes advanced per loop pass
01498         "sub $2, %%"REG_c"               \n\t" // cnt -= 2
01499         "jnz 1b                \n\t" // loop until the counter is exactly zero
01500         "jmp 5f                   \n\t"
01501 
01502         "3:                    \n\t" // general path: mm0 = d1, mm1 = d3, mm5 = d5, mm6 = d7 (all thresholded)
01503         
01504         
01505         
01506         
01507         
01508         "movq %%mm5, %%mm3             \n\t"
01509         "psubw %%mm1, %%mm5            \n\t"
01510 
01511         "psllw $1, %%mm5              \n\t" // z10
01512         "paddw %%mm1, %%mm3            \n\t" // z13
01513 
01514         "movq %%mm0, %%mm2             \n\t"
01515         "psubw %%mm6, %%mm0            \n\t"
01516 
01517         "movq %%mm5, %%mm1             \n\t"
01518         "psllw $1, %%mm0              \n\t" // z12
01519 
01520         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" 
01521         "paddw %%mm0, %%mm5            \n\t" // z10 + z12
01522 
01523         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" // z5
01524         "paddw %%mm6, %%mm2            \n\t" // z11
01525 
01526         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01527         "movq %%mm2, %%mm7             \n\t"
01528 
01529         
01530         "movq 0*8+%3, %%mm4            \n\t" // reload tmp0
01531         "psubw %%mm3, %%mm2            \n\t" // z11 - z13
01532 
01533         "psllw $1, %%mm2              \n\t"
01534         "paddw %%mm3, %%mm7            \n\t" // z11 + z13
01535 
01536         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" // tmp11
01537         "movq %%mm4, %%mm6             \n\t"
01538         
01539         "psraw $2, %%mm7              \n\t" // tmp7
01540 
01541         "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01542         "psubw %%mm7, %%mm6            \n\t"
01543 
01544         "movq 1*8+%3, %%mm3            \n\t" // reload tmp1
01545         "paddw %%mm7, %%mm4            \n\t"
01546 
01547         "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" // row 7 = tmp0 - tmp7 (stored, not accumulated)
01548         "paddw %%mm5, %%mm1            \n\t" // tmp12
01549 
01550         "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" // row 0 += tmp0 + tmp7
01551         "psubw %%mm7, %%mm1            \n\t" // tmp6
01552 
01553         "movq 2*8+%3, %%mm7            \n\t" // reload tmp2
01554         "psubw %%mm5, %%mm0            \n\t" // tmp10
01555 
01556         "movq 3*8+%3, %%mm6            \n\t" // reload tmp3
01557         "movq %%mm3, %%mm5             \n\t"
01558 
01559         "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01560         "psubw %%mm1, %%mm5            \n\t"
01561 
01562         "psubw %%mm1, %%mm2            \n\t" // tmp5
01563         "paddw %%mm1, %%mm3            \n\t"
01564 
01565         "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" // row 6 = tmp1 - tmp6 (stored, not accumulated)
01566         "movq %%mm7, %%mm4             \n\t"
01567 
01568         "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01569         "psubw %%mm2, %%mm4            \n\t"
01570 
01571         "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01572         "paddw %%mm2, %%mm7            \n\t"
01573 
01574         "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" // row 1 += tmp1 + tmp6
01575         "paddw %%mm2, %%mm0            \n\t" // tmp4
01576 
01577         
01578         "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" // row 2 += tmp2 + tmp5
01579         "movq %%mm6, %%mm1             \n\t"
01580 
01581         "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01582         "psubw %%mm0, %%mm1            \n\t"
01583 
01584         "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01585         "paddw %%mm0, %%mm6            \n\t"
01586 
01587         "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" // row 5 += tmp2 - tmp5
01588         "add $24, %%"REG_S"              \n\t" // data: 8 + 24 = 32 bytes advanced per loop pass
01589 
01590         "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" // row 4 += tmp3 + tmp4
01591 
01592         "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" // row 3 += tmp3 - tmp4
01593         "add $24, %%"REG_D"              \n\t" // output: 8 + 24 = 32 bytes advanced per loop pass
01594         "sub $2, %%"REG_c"               \n\t" // cnt -= 2
01595         "jnz 1b                \n\t" // loop until the counter is exactly zero
01596         "5:                      \n\t" // common exit
01597 
01598         : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps) // %0..%2 are read and updated; %3 is the scratch area
01599         : "d"(thr_adr) // threshold table pointer, input only
01600         : "%"REG_a // used for the zero test of the odd coefficients
01601         );
01602 }
01603 
01604 #endif // HAVE_MMX
01605 
01606 #if !HAVE_MMX
01607 
01608 // Row pass of the inverse transform, plain C variant (built when !HAVE_MMX).
01609 //
01610 // workspace      coefficients, DCTSIZE per row; cnt*4 rows are consumed
01611 // output_adr     destination; the eight results of row i are added to
01612 //                output_adr[i + k*output_stride] for k = 0..7
01613 // output_stride  distance in elements between successive results of a row
01614 static void row_idct_c(DCTELEM* workspace,
01615                        int16_t* output_adr, int output_stride, int cnt)
01616 {
01617     DCTELEM* row = workspace;
01618     int16_t* col = output_adr;
01619     int rows_left = cnt * 4;
01620 
01621     while (rows_left > 0) {
01622         int_simd16_t p0, p1, p2, p3;              // results of the first half
01623         int_simd16_t q4, q5, q6, q7;              // results of the second half
01624         int_simd16_t lo_sum, lo_diff, hi_sum, hi_diff;
01625         int_simd16_t s45, d45, s67, d67, rot, m10, m11, m12;
01626 
01627         // First half: built from inputs 0..3 of the row.
01628         hi_sum  = row[2] + row[3];
01629         hi_diff = row[2] - row[3];
01630         lo_sum  = row[0] + row[1];
01631         lo_diff = (MULTIPLY16H(row[0] - row[1], FIX_1_414213562_A) << 2) - lo_sum;
01632 
01633         p0 = hi_sum + lo_sum;
01634         p3 = hi_sum - lo_sum;
01635         p1 = hi_diff + lo_diff;
01636         p2 = hi_diff - lo_diff;
01637 
01638         // Second half: built from inputs 4..7 of the row.
01639         s45 = row[4] + row[5];
01640         d45 = row[4] - row[5];
01641         s67 = row[6] + row[7];
01642         d67 = row[6] - row[7];
01643 
01644         q7  = s67 + s45;
01645         m11 = MULTIPLY16H(s67 - s45, FIX_1_414213562);
01646         rot = MULTIPLY16H(d45 + d67, FIX_1_847759065);
01647         m10 = MULTIPLY16H(d67, FIX_1_082392200) - rot;
01648         m12 = MULTIPLY16H(d45, FIX_2_613125930) + rot;
01649 
01650         q6 = (m12 << 3) - q7;
01651         q5 = (m11 << 3) - q6;
01652         q4 = (m10 << 3) + q5;
01653 
01654         // Combine both halves, DESCALE(..., 3), and accumulate into the output.
01655         col[0 * output_stride] += DESCALE(p0 + q7, 3);
01656         col[1 * output_stride] += DESCALE(p1 + q6, 3);
01657         col[2 * output_stride] += DESCALE(p2 + q5, 3);
01658         col[3 * output_stride] += DESCALE(p3 - q4, 3);
01659         col[4 * output_stride] += DESCALE(p3 + q4, 3);
01660         col[5 * output_stride] += DESCALE(p2 - q5, 3);
01661         col[6 * output_stride] += DESCALE(p1 - q6, 3);
01662         col[7 * output_stride] += DESCALE(p0 - q7, 3);
01663 
01664         // Move one element along the output and one row down the workspace.
01665         col++;
01666         row += DCTSIZE;
01667         rows_left--;
01668     }
01669 }
01670 
01671 #else 
01672 
01673 static void row_idct_mmx (DCTELEM* workspace,
01674                           int16_t* output_adr,  int output_stride,  int cnt)
01675 {
01676     uint64_t __attribute__((aligned(8))) temps[4];
01677     __asm__ volatile(
01678         "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
01679         "1:                     \n\t"
01680         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
01681         
01682 
01683         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
01684         "movq %%mm0, %%mm4             \n\t"
01685 
01686         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01687         "punpcklwd %%mm1, %%mm0        \n\t"
01688 
01689         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
01690         "punpckhwd %%mm1, %%mm4        \n\t"
01691 
01692         
01693         "movq %%mm2, %%mm7             \n\t"
01694         "punpcklwd %%mm3, %%mm2        \n\t"
01695 
01696         "movq %%mm0, %%mm6             \n\t"
01697         "punpckldq %%mm2, %%mm0        \n\t" 
01698 
01699         "punpckhdq %%mm2, %%mm6        \n\t" 
01700         "movq %%mm0, %%mm5             \n\t"
01701 
01702         "punpckhwd %%mm3, %%mm7        \n\t"
01703         "psubw %%mm6, %%mm0            \n\t"
01704 
01705         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
01706         "movq %%mm4, %%mm2             \n\t"
01707 
01708         "punpckldq %%mm7, %%mm4        \n\t" 
01709         "paddw %%mm6, %%mm5            \n\t"
01710 
01711         "punpckhdq %%mm7, %%mm2        \n\t" 
01712         "movq %%mm4, %%mm1             \n\t"
01713 
01714         "psllw $2, %%mm0              \n\t"
01715         "paddw %%mm2, %%mm4            \n\t" 
01716 
01717         "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
01718         "psubw %%mm2, %%mm1            \n\t" 
01719 
01720         "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
01721         "psubw %%mm5, %%mm0            \n\t"
01722 
01723         "movq %%mm4, %%mm6             \n\t"
01724         "paddw %%mm5, %%mm4            \n\t" 
01725 
01726         "psubw %%mm5, %%mm6            \n\t" 
01727         "movq %%mm1, %%mm7             \n\t"
01728 
01729         "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
01730         "paddw %%mm0, %%mm1            \n\t" 
01731 
01732         "movq %%mm4, 0*8+%3            \n\t" 
01733         "movq %%mm3, %%mm4             \n\t"
01734 
01735         "movq %%mm6, 1*8+%3            \n\t" 
01736         "punpcklwd %%mm2, %%mm3        \n\t"
01737 
01738         
01739         "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
01740         "punpckhwd %%mm2, %%mm4        \n\t"
01741 
01742         "movq %%mm5, %%mm2             \n\t"
01743         "punpcklwd %%mm6, %%mm5        \n\t"
01744 
01745         "psubw %%mm0, %%mm7            \n\t" 
01746         "punpckhwd %%mm6, %%mm2        \n\t"
01747 
01748         "movq %%mm3, %%mm0             \n\t"
01749         "punpckldq %%mm5, %%mm3        \n\t" 
01750 
01751         "punpckhdq %%mm5, %%mm0        \n\t" 
01752         "movq %%mm4, %%mm5             \n\t"
01753 
01754         
01755         "movq %%mm3, %%mm6             \n\t"
01756         "punpckldq %%mm2, %%mm4        \n\t" 
01757 
01758         "psubw %%mm0, %%mm3            \n\t" 
01759         "punpckhdq %%mm2, %%mm5        \n\t" 
01760 
01761         "paddw %%mm0, %%mm6            \n\t" 
01762         "movq %%mm4, %%mm2             \n\t"
01763 
01764         "movq %%mm3, %%mm0             \n\t"
01765         "psubw %%mm5, %%mm4            \n\t" 
01766 
01767         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" 
01768         "paddw %%mm4, %%mm3            \n\t"
01769 
01770         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" 
01771         "paddw %%mm5, %%mm2            \n\t" 
01772 
01773         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
01774         "movq %%mm2, %%mm5             \n\t"
01775 
01776         "psubw %%mm6, %%mm2            \n\t"
01777         "paddw %%mm6, %%mm5            \n\t" 
01778 
01779         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" 
01780         "paddw %%mm3, %%mm0            \n\t" 
01781 
01782         "psllw $3, %%mm0              \n\t"
01783         "psubw %%mm3, %%mm4            \n\t" 
01784 
01785         "movq 0*8+%3, %%mm6            \n\t"
01786         "movq %%mm1, %%mm3             \n\t"
01787 
01788         "psllw $3, %%mm4              \n\t"
01789         "psubw %%mm5, %%mm0            \n\t" 
01790 
01791         "psllw $3, %%mm2              \n\t"
01792         "paddw %%mm0, %%mm1            \n\t" 
01793 
01794         "psubw %%mm0, %%mm2            \n\t" 
01795         "psubw %%mm0, %%mm3            \n\t" 
01796 
01797         "paddw %%mm2, %%mm4            \n\t" 
01798         "movq %%mm7, %%mm0             \n\t"
01799 
01800         "paddw %%mm2, %%mm7            \n\t" 
01801         "psubw %%mm2, %%mm0            \n\t" 
01802 
01803         "movq "MANGLE(MM_DESCALE_RND)", %%mm2   \n\t" 
01804         "psubw %%mm5, %%mm6            \n\t" 
01805 
01806         "paddw 0*8+%3, %%mm5           \n\t" 
01807         "paddw %%mm2, %%mm1            \n\t"
01808 
01809         "paddw %%mm2, %%mm5            \n\t"
01810         "psraw $3, %%mm1              \n\t"
01811 
01812         "paddw %%mm2, %%mm7            \n\t"
01813         "psraw $3, %%mm5              \n\t"
01814 
01815         "paddw (%%"REG_D"), %%mm5          \n\t"
01816         "psraw $3, %%mm7              \n\t"
01817 
01818         "paddw (%%"REG_D",%%"REG_a",), %%mm1    \n\t"
01819         "paddw %%mm2, %%mm0            \n\t"
01820 
01821         "paddw (%%"REG_D",%%"REG_a",2), %%mm7   \n\t"
01822         "paddw %%mm2, %%mm3            \n\t"
01823 
01824         "movq %%mm5, (%%"REG_D")           \n\t"
01825         "paddw %%mm2, %%mm6            \n\t"
01826 
01827         "movq %%mm1, (%%"REG_D",%%"REG_a",)     \n\t"
01828         "psraw $3, %%mm0              \n\t"
01829 
01830         "movq %%mm7, (%%"REG_D",%%"REG_a",2)    \n\t"
01831         "add %%"REG_d", %%"REG_D"             \n\t" 
01832 
01833         "movq 1*8+%3, %%mm5           \n\t" 
01834         "psraw $3, %%mm3              \n\t"
01835 
01836         "paddw (%%"REG_D",%%"REG_a",2), %%mm0   \n\t"
01837         "psubw %%mm4, %%mm5            \n\t" 
01838 
01839         "paddw (%%"REG_D",%%"REG_d",), %%mm3    \n\t"
01840         "psraw $3, %%mm6              \n\t"
01841 
01842         "paddw 1*8+%3, %%mm4           \n\t" 
01843         "paddw %%mm2, %%mm5            \n\t"
01844 
01845         "paddw (%%"REG_D",%%"REG_a",4), %%mm6   \n\t"
01846         "paddw %%mm2, %%mm4            \n\t"
01847 
01848         "movq %%mm0, (%%"REG_D",%%"REG_a",2)    \n\t"
01849         "psraw $3, %%mm5              \n\t"
01850 
01851         "paddw (%%"REG_D"), %%mm5          \n\t"
01852         "psraw $3, %%mm4              \n\t"
01853 
01854         "paddw (%%"REG_D",%%"REG_a",), %%mm4    \n\t"
01855         "add $"DCTSIZE_S"*2*4, %%"REG_S"      \n\t" 
01856 
01857         "movq %%mm3, (%%"REG_D",%%"REG_d",)     \n\t"
01858         "movq %%mm6, (%%"REG_D",%%"REG_a",4)    \n\t"
01859         "movq %%mm5, (%%"REG_D")           \n\t"
01860         "movq %%mm4, (%%"REG_D",%%"REG_a",)     \n\t"
01861 
01862         "sub %%"REG_d", %%"REG_D"             \n\t"
01863         "add $8, %%"REG_D"               \n\t"
01864         "dec %%"REG_c"                   \n\t"
01865         "jnz 1b                  \n\t"
01866 
01867         : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
01868         : "a"(output_stride*sizeof(short))
01869         : "%"REG_d
01870         );
01871 }
01872 
01873 #endif // HAVE_MMX
01874 
01875 #if !HAVE_MMX
01876 
01877 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
01878 {
01879     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
01880     int_simd16_t tmp10, tmp11, tmp12, tmp13;
01881     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
01882     DCTELEM *dataptr;
01883 
01884     cnt*=4;
01885     
01886 
01887     dataptr = data;
01888     for (; cnt > 0; cnt--) {
01889         tmp0 = pixels[line_size*0] + pixels[line_size*7];
01890         tmp7 = pixels[line_size*0] - pixels[line_size*7];
01891         tmp1 = pixels[line_size*1] + pixels[line_size*6];
01892         tmp6 = pixels[line_size*1] - pixels[line_size*6];
01893         tmp2 = pixels[line_size*2] + pixels[line_size*5];
01894         tmp5 = pixels[line_size*2] - pixels[line_size*5];
01895         tmp3 = pixels[line_size*3] + pixels[line_size*4];
01896         tmp4 = pixels[line_size*3] - pixels[line_size*4];
01897 
01898         
01899 
01900         tmp10 = tmp0 + tmp3;
01901         tmp13 = tmp0 - tmp3;
01902         tmp11 = tmp1 + tmp2;
01903         tmp12 = tmp1 - tmp2;
01904         
01905         
01906         
01907         dataptr[2] = tmp10 + tmp11;
01908         dataptr[3] = tmp10 - tmp11;
01909 
01910         z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
01911         dataptr[0] = tmp13 + z1;
01912         dataptr[1] = tmp13 - z1;
01913 
01914         
01915 
01916         tmp10 = (tmp4 + tmp5) <<2;
01917         tmp11 = (tmp5 + tmp6) <<2;
01918         tmp12 = (tmp6 + tmp7) <<2;
01919 
01920         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
01921         z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
01922         z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
01923         z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
01924 
01925         z11 = tmp7 + z3;
01926         z13 = tmp7 - z3;
01927 
01928         dataptr[4] = z13 + z2;
01929         dataptr[5] = z13 - z2;
01930         dataptr[6] = z11 + z4;
01931         dataptr[7] = z11 - z4;
01932 
01933         pixels++;               
01934         dataptr += DCTSIZE;
01935     }
01936 }
01937 
01938 #else 
01939 
01940 static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt)
01941 {
01942     uint64_t __attribute__((aligned(8))) temps[4];
01943     __asm__ volatile(
01944         "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
01945         "6:                     \n\t"
01946         "movd (%%"REG_S"), %%mm0           \n\t"
01947         "pxor %%mm7, %%mm7             \n\t"
01948 
01949         "movd (%%"REG_S",%%"REG_a",), %%mm1     \n\t"
01950         "punpcklbw %%mm7, %%mm0        \n\t"
01951 
01952         "movd (%%"REG_S",%%"REG_a",2), %%mm2    \n\t"
01953         "punpcklbw %%mm7, %%mm1        \n\t"
01954 
01955         "punpcklbw %%mm7, %%mm2        \n\t"
01956         "add %%"REG_d", %%"REG_S"             \n\t"
01957 
01958         "movq %%mm0, %%mm5             \n\t"
01959         
01960 
01961         "movd (%%"REG_S",%%"REG_a",4), %%mm3    \n\t" 
01962         "movq %%mm1, %%mm6             \n\t"
01963 
01964         "movd (%%"REG_S",%%"REG_d",), %%mm4     \n\t" 
01965         "punpcklbw %%mm7, %%mm3        \n\t"
01966 
01967         "psubw %%mm3, %%mm5            \n\t"
01968         "punpcklbw %%mm7, %%mm4        \n\t"
01969 
01970         "paddw %%mm3, %%mm0            \n\t"
01971         "psubw %%mm4, %%mm6            \n\t"
01972 
01973         "movd (%%"REG_S",%%"REG_a",2), %%mm3    \n\t" 
01974         "paddw %%mm4, %%mm1            \n\t"
01975 
01976         "movq %%mm5, 0*8+%3            \n\t" 
01977         "punpcklbw %%mm7, %%mm3        \n\t"
01978 
01979         "movq %%mm6, 1*8+%3            \n\t" 
01980         "movq %%mm2, %%mm4             \n\t"
01981 
01982         "movd (%%"REG_S"), %%mm5           \n\t" 
01983         "paddw %%mm3, %%mm2            \n\t"
01984 
01985         "movd (%%"REG_S",%%"REG_a",), %%mm6     \n\t" 
01986         "punpcklbw %%mm7, %%mm5        \n\t"
01987 
01988         "psubw %%mm3, %%mm4            \n\t"
01989         "punpcklbw %%mm7, %%mm6        \n\t"
01990 
01991         "movq %%mm5, %%mm3             \n\t"
01992         "paddw %%mm6, %%mm5            \n\t" 
01993 
01994         "psubw %%mm6, %%mm3            \n\t" 
01995         "movq %%mm0, %%mm6             \n\t"
01996 
01997         "movq %%mm1, %%mm7             \n\t"
01998         "psubw %%mm5, %%mm0            \n\t" 
01999 
02000         "psubw %%mm2, %%mm1            \n\t"
02001         "paddw %%mm2, %%mm7            \n\t" 
02002 
02003         "paddw %%mm0, %%mm1            \n\t"
02004         "movq %%mm7, %%mm2             \n\t"
02005 
02006         "psllw $2, %%mm1              \n\t"
02007         "paddw %%mm5, %%mm6            \n\t" 
02008 
02009         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
02010         "paddw %%mm6, %%mm7            \n\t" 
02011 
02012         "psubw %%mm2, %%mm6            \n\t" 
02013         "movq %%mm0, %%mm5             \n\t"
02014 
02015         
02016         "movq %%mm7, %%mm2             \n\t"
02017         "punpcklwd %%mm6, %%mm7        \n\t"
02018 
02019         "paddw %%mm1, %%mm0            \n\t" 
02020         "punpckhwd %%mm6, %%mm2        \n\t"
02021 
02022         "psubw %%mm1, %%mm5            \n\t" 
02023         "movq %%mm0, %%mm6             \n\t"
02024 
02025         "movq 1*8+%3, %%mm1            \n\t"
02026         "punpcklwd %%mm5, %%mm0        \n\t"
02027 
02028         "punpckhwd %%mm5, %%mm6        \n\t"
02029         "movq %%mm0, %%mm5             \n\t"
02030 
02031         "punpckldq %%mm7, %%mm0        \n\t" 
02032         "paddw %%mm4, %%mm3            \n\t"
02033 
02034         "punpckhdq %%mm7, %%mm5        \n\t" 
02035         "movq %%mm6, %%mm7             \n\t"
02036 
02037         "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
02038         "punpckldq %%mm2, %%mm6        \n\t" 
02039 
02040         "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
02041         "punpckhdq %%mm2, %%mm7        \n\t" 
02042 
02043         "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
02044         "paddw %%mm1, %%mm4            \n\t"
02045 
02046         "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
02047         "psllw $2, %%mm3              \n\t" 
02048 
02049         "movq 0*8+%3, %%mm2           \n\t"
02050         "psllw $2, %%mm4              \n\t" 
02051 
02052         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" 
02053         "paddw %%mm2, %%mm1            \n\t"
02054 
02055         "psllw $2, %%mm1              \n\t" 
02056         "movq %%mm3, %%mm0             \n\t"
02057 
02058         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
02059         "psubw %%mm1, %%mm3            \n\t"
02060 
02061         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" 
02062         "movq %%mm2, %%mm5             \n\t"
02063 
02064         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
02065         "psubw %%mm4, %%mm2            \n\t" 
02066 
02067         "paddw %%mm4, %%mm5            \n\t" 
02068         "movq %%mm2, %%mm6             \n\t"
02069 
02070         "paddw %%mm3, %%mm0            \n\t" 
02071         "movq %%mm5, %%mm7             \n\t"
02072 
02073         "paddw %%mm0, %%mm2            \n\t" 
02074         "psubw %%mm0, %%mm6            \n\t" 
02075 
02076         "movq %%mm2, %%mm4             \n\t"
02077         "paddw %%mm3, %%mm1            \n\t" 
02078 
02079         
02080         "punpcklwd %%mm6, %%mm2        \n\t"
02081         "paddw %%mm1, %%mm5            \n\t" 
02082 
02083         "punpckhwd %%mm6, %%mm4        \n\t"
02084         "psubw %%mm1, %%mm7            \n\t" 
02085 
02086         "movq %%mm5, %%mm6             \n\t"
02087         "punpcklwd %%mm7, %%mm5        \n\t"
02088 
02089         "punpckhwd %%mm7, %%mm6        \n\t"
02090         "movq %%mm2, %%mm7             \n\t"
02091 
02092         "punpckldq %%mm5, %%mm2        \n\t" 
02093         "sub %%"REG_d", %%"REG_S"             \n\t"
02094 
02095         "punpckhdq %%mm5, %%mm7        \n\t" 
02096         "movq %%mm4, %%mm5             \n\t"
02097 
02098         "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02099         "punpckldq %%mm6, %%mm4        \n\t" 
02100 
02101         "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02102         "punpckhdq %%mm6, %%mm5        \n\t" 
02103 
02104         "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02105         "add $4, %%"REG_S"               \n\t"
02106 
02107         "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02108         "add $"DCTSIZE_S"*2*4, %%"REG_D"      \n\t" 
02109         "dec %%"REG_c"                   \n\t"
02110         "jnz 6b                  \n\t"
02111 
02112         : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
02113         : "a"(line_size)
02114         : "%"REG_d);
02115 }
02116 
02117 #endif // HAVE_MMX