00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 #include "libavcodec/dsputil.h"
00023 #include "libavcodec/simple_idct.h"
00024 #include "dsputil_mmx.h"
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 // Fixed-point cosine constants: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), i.e. scaled by 2^14,
00034 // converted to an integer after adding the rounding offset shown on each line.
00035 // C1..C7 are the entries of the coeffs[] table; C0 is not referenced in the reviewed part of this file.
00036 #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00037 #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00038 #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00039 #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00040 #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (exact value is 1<<14 = 16384; the -0.5 offset yields 16383)
00041 #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00042 #define C6 8867  // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00043 #define C7 4520  // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00044 // Shift amounts. NOTE(review): the asm macro invocations in this file pass the literals 11 and 20, not these macros, so the two must be changed together.
00045 #define ROW_SHIFT 11 // coeffs[] uses 1<<(ROW_SHIFT-1) as its rounding term
00046 #define COL_SHIFT 20 // 6 -- NOTE(review): meaning of the original "6" annotation is not evident from this file; confirm
00047 // 64-bit constants referenced by name from the inline asm through MANGLE().
00048 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // mask keeping 16-bit words 1 and 3 (clears words 0 and 2); ANDed with the first source quadword in the DC_COND_* zero test
00049 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 0x40000 in the low dword only; added in the DC-only path of DC_COND_IDCT between "pslld $16" and "psrad $13"
00050 // Constant table for the inline asm. Each row below is four int16_t = 8 bytes, so row k starts at byte offset 8*k.
00051 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
00052         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: rounder; as little-endian dwords this is 1<<(ROW_SHIFT-1) twice
00053 
00054 
00055         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: rounder whose first dword carries an extra 1 in its high word, i.e. (1<<(ROW_SHIFT-1)) + (1<<16)
00056 // NOTE(review): the asm addresses this table as N(%2) with hard-coded N (0, 8, 16 ... 104), so rows must not be
00057 // reordered, inserted or removed, and the element type must stay 16-bit. The operand list binding %2 is outside
00058 // the reviewed range -- presumably it is this table; confirm.
00059 
00060  C4,  C4,  C4,  C4, // offset 16
00061  C4, -C4,  C4, -C4, // offset 24
00062 
00063  C2,  C6,  C2,  C6, // offset 32
00064  C6, -C2,  C6, -C2, // offset 40
00065 
00066  C1,  C3,  C1,  C3, // offset 48
00067  C5,  C7,  C5,  C7, // offset 56
00068 
00069  C3, -C7,  C3, -C7, // offset 64
00070 -C1, -C5, -C1, -C5, // offset 72
00071 
00072  C5, -C1,  C5, -C1, // offset 80
00073  C7,  C3,  C7,  C3, // offset 88
00074 
00075  C7, -C5,  C7, -C5, // offset 96
00076  C3, -C1,  C3, -C1 // offset 104 (last row; table is 112 bytes)
00077 };
00078 
00079 static inline void idct(int16_t *block)
00080 {
00081         DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
00082         int16_t * const temp= (int16_t*)align_tmp;
00083 
00084         __asm__ volatile(
00085 #if 0 //Alternative, simpler variant
00086 
00087 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00088         "movq " #src0 ", %%mm0          \n\t" \
00089         "movq " #src4 ", %%mm1          \n\t" \
00090         "movq " #src1 ", %%mm2          \n\t" \
00091         "movq " #src5 ", %%mm3          \n\t" \
00092         "movq 16(%2), %%mm4             \n\t" \
00093         "pmaddwd %%mm0, %%mm4           \n\t" \
00094         "movq 24(%2), %%mm5             \n\t" \
00095         "pmaddwd %%mm5, %%mm0           \n\t" \
00096         "movq 32(%2), %%mm5             \n\t" \
00097         "pmaddwd %%mm1, %%mm5           \n\t" \
00098         "movq 40(%2), %%mm6             \n\t" \
00099         "pmaddwd %%mm6, %%mm1           \n\t" \
00100         "movq 48(%2), %%mm7             \n\t" \
00101         "pmaddwd %%mm2, %%mm7           \n\t" \
00102         #rounder ", %%mm4               \n\t"\
00103         "movq %%mm4, %%mm6              \n\t" \
00104         "paddd %%mm5, %%mm4             \n\t" \
00105         "psubd %%mm5, %%mm6             \n\t" \
00106         "movq 56(%2), %%mm5             \n\t" \
00107         "pmaddwd %%mm3, %%mm5           \n\t" \
00108         #rounder ", %%mm0               \n\t"\
00109         "paddd %%mm0, %%mm1             \n\t" \
00110         "paddd %%mm0, %%mm0             \n\t" \
00111         "psubd %%mm1, %%mm0             \n\t" \
00112         "pmaddwd 64(%2), %%mm2          \n\t" \
00113         "paddd %%mm5, %%mm7             \n\t" \
00114         "movq 72(%2), %%mm5             \n\t" \
00115         "pmaddwd %%mm3, %%mm5           \n\t" \
00116         "paddd %%mm4, %%mm7             \n\t" \
00117         "paddd %%mm4, %%mm4             \n\t" \
00118         "psubd %%mm7, %%mm4             \n\t" \
00119         "paddd %%mm2, %%mm5             \n\t" \
00120         "psrad $" #shift ", %%mm7       \n\t"\
00121         "psrad $" #shift ", %%mm4       \n\t"\
00122         "movq %%mm1, %%mm2              \n\t" \
00123         "paddd %%mm5, %%mm1             \n\t" \
00124         "psubd %%mm5, %%mm2             \n\t" \
00125         "psrad $" #shift ", %%mm1       \n\t"\
00126         "psrad $" #shift ", %%mm2       \n\t"\
00127         "packssdw %%mm1, %%mm7          \n\t" \
00128         "packssdw %%mm4, %%mm2          \n\t" \
00129         "movq %%mm7, " #dst "           \n\t"\
00130         "movq " #src1 ", %%mm1          \n\t" \
00131         "movq 80(%2), %%mm4             \n\t" \
00132         "movq %%mm2, 24+" #dst "        \n\t"\
00133         "pmaddwd %%mm1, %%mm4           \n\t" \
00134         "movq 88(%2), %%mm7             \n\t" \
00135         "pmaddwd 96(%2), %%mm1          \n\t" \
00136         "pmaddwd %%mm3, %%mm7           \n\t" \
00137         "movq %%mm0, %%mm2              \n\t" \
00138         "pmaddwd 104(%2), %%mm3         \n\t" \
00139         "paddd %%mm7, %%mm4             \n\t" \
00140         "paddd %%mm4, %%mm2             \n\t" \
00141         "psubd %%mm4, %%mm0             \n\t" \
00142         "psrad $" #shift ", %%mm2       \n\t"\
00143         "psrad $" #shift ", %%mm0       \n\t"\
00144         "movq %%mm6, %%mm4              \n\t" \
00145         "paddd %%mm1, %%mm3             \n\t" \
00146         "paddd %%mm3, %%mm6             \n\t" \
00147         "psubd %%mm3, %%mm4             \n\t" \
00148         "psrad $" #shift ", %%mm6       \n\t"\
00149         "packssdw %%mm6, %%mm2          \n\t" \
00150         "movq %%mm2, 8+" #dst "         \n\t"\
00151         "psrad $" #shift ", %%mm4       \n\t"\
00152         "packssdw %%mm0, %%mm4          \n\t" \
00153         "movq %%mm4, 16+" #dst "        \n\t"\
00154 
00155 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
00156         "movq " #src0 ", %%mm0          \n\t" \
00157         "movq " #src4 ", %%mm1          \n\t" \
00158         "movq " #src1 ", %%mm2          \n\t" \
00159         "movq " #src5 ", %%mm3          \n\t" \
00160         "movq 16(%2), %%mm4             \n\t" \
00161         "pmaddwd %%mm0, %%mm4           \n\t" \
00162         "movq 24(%2), %%mm5             \n\t" \
00163         "pmaddwd %%mm5, %%mm0           \n\t" \
00164         "movq 32(%2), %%mm5             \n\t" \
00165         "pmaddwd %%mm1, %%mm5           \n\t" \
00166         "movq 40(%2), %%mm6             \n\t" \
00167         "pmaddwd %%mm6, %%mm1           \n\t" \
00168         "movq %%mm4, %%mm6              \n\t" \
00169         "movq 48(%2), %%mm7             \n\t" \
00170         "pmaddwd %%mm2, %%mm7           \n\t" \
00171         "paddd %%mm5, %%mm4             \n\t" \
00172         "psubd %%mm5, %%mm6             \n\t" \
00173         "movq %%mm0, %%mm5              \n\t" \
00174         "paddd %%mm1, %%mm0             \n\t" \
00175         "psubd %%mm1, %%mm5             \n\t" \
00176         "movq 56(%2), %%mm1             \n\t" \
00177         "pmaddwd %%mm3, %%mm1           \n\t" \
00178         "pmaddwd 64(%2), %%mm2          \n\t" \
00179         "paddd %%mm1, %%mm7             \n\t" \
00180         "movq 72(%2), %%mm1             \n\t" \
00181         "pmaddwd %%mm3, %%mm1           \n\t" \
00182         "paddd %%mm4, %%mm7             \n\t" \
00183         "paddd %%mm4, %%mm4             \n\t" \
00184         "psubd %%mm7, %%mm4             \n\t" \
00185         "paddd %%mm2, %%mm1             \n\t" \
00186         "psrad $" #shift ", %%mm7       \n\t"\
00187         "psrad $" #shift ", %%mm4       \n\t"\
00188         "movq %%mm0, %%mm2              \n\t" \
00189         "paddd %%mm1, %%mm0             \n\t" \
00190         "psubd %%mm1, %%mm2             \n\t" \
00191         "psrad $" #shift ", %%mm0       \n\t"\
00192         "psrad $" #shift ", %%mm2       \n\t"\
00193         "packssdw %%mm7, %%mm7          \n\t" \
00194         "movd %%mm7, " #dst "           \n\t"\
00195         "packssdw %%mm0, %%mm0          \n\t" \
00196         "movd %%mm0, 16+" #dst "        \n\t"\
00197         "packssdw %%mm2, %%mm2          \n\t" \
00198         "movd %%mm2, 96+" #dst "        \n\t"\
00199         "packssdw %%mm4, %%mm4          \n\t" \
00200         "movd %%mm4, 112+" #dst "       \n\t"\
00201         "movq " #src1 ", %%mm0          \n\t" \
00202         "movq 80(%2), %%mm4             \n\t" \
00203         "pmaddwd %%mm0, %%mm4           \n\t" \
00204         "movq 88(%2), %%mm7             \n\t" \
00205         "pmaddwd 96(%2), %%mm0          \n\t" \
00206         "pmaddwd %%mm3, %%mm7           \n\t" \
00207         "movq %%mm5, %%mm2              \n\t" \
00208         "pmaddwd 104(%2), %%mm3         \n\t" \
00209         "paddd %%mm7, %%mm4             \n\t" \
00210         "paddd %%mm4, %%mm2             \n\t" \
00211         "psubd %%mm4, %%mm5             \n\t" \
00212         "psrad $" #shift ", %%mm2       \n\t"\
00213         "psrad $" #shift ", %%mm5       \n\t"\
00214         "movq %%mm6, %%mm4              \n\t" \
00215         "paddd %%mm0, %%mm3             \n\t" \
00216         "paddd %%mm3, %%mm6             \n\t" \
00217         "psubd %%mm3, %%mm4             \n\t" \
00218         "psrad $" #shift ", %%mm6       \n\t"\
00219         "psrad $" #shift ", %%mm4       \n\t"\
00220         "packssdw %%mm2, %%mm2          \n\t" \
00221         "packssdw %%mm6, %%mm6          \n\t" \
00222         "movd %%mm2, 32+" #dst "        \n\t"\
00223         "packssdw %%mm4, %%mm4          \n\t" \
00224         "packssdw %%mm5, %%mm5          \n\t" \
00225         "movd %%mm6, 48+" #dst "        \n\t"\
00226         "movd %%mm4, 64+" #dst "        \n\t"\
00227         "movd %%mm5, 80+" #dst "        \n\t"\
00228 
00229 
00230 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00231         "movq " #src0 ", %%mm0          \n\t" \
00232         "movq " #src4 ", %%mm1          \n\t" \
00233         "movq " #src1 ", %%mm2          \n\t" \
00234         "movq " #src5 ", %%mm3          \n\t" \
00235         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
00236         "pand %%mm0, %%mm4              \n\t"\
00237         "por %%mm1, %%mm4               \n\t"\
00238         "por %%mm2, %%mm4               \n\t"\
00239         "por %%mm3, %%mm4               \n\t"\
00240         "packssdw %%mm4,%%mm4           \n\t"\
00241         "movd %%mm4, %%eax              \n\t"\
00242         "orl %%eax, %%eax               \n\t"\
00243         "jz 1f                          \n\t"\
00244         "movq 16(%2), %%mm4             \n\t" \
00245         "pmaddwd %%mm0, %%mm4           \n\t" \
00246         "movq 24(%2), %%mm5             \n\t" \
00247         "pmaddwd %%mm5, %%mm0           \n\t" \
00248         "movq 32(%2), %%mm5             \n\t" \
00249         "pmaddwd %%mm1, %%mm5           \n\t" \
00250         "movq 40(%2), %%mm6             \n\t" \
00251         "pmaddwd %%mm6, %%mm1           \n\t" \
00252         "movq 48(%2), %%mm7             \n\t" \
00253         "pmaddwd %%mm2, %%mm7           \n\t" \
00254         #rounder ", %%mm4               \n\t"\
00255         "movq %%mm4, %%mm6              \n\t" \
00256         "paddd %%mm5, %%mm4             \n\t" \
00257         "psubd %%mm5, %%mm6             \n\t" \
00258         "movq 56(%2), %%mm5             \n\t" \
00259         "pmaddwd %%mm3, %%mm5           \n\t" \
00260         #rounder ", %%mm0               \n\t"\
00261         "paddd %%mm0, %%mm1             \n\t" \
00262         "paddd %%mm0, %%mm0             \n\t" \
00263         "psubd %%mm1, %%mm0             \n\t" \
00264         "pmaddwd 64(%2), %%mm2          \n\t" \
00265         "paddd %%mm5, %%mm7             \n\t" \
00266         "movq 72(%2), %%mm5             \n\t" \
00267         "pmaddwd %%mm3, %%mm5           \n\t" \
00268         "paddd %%mm4, %%mm7             \n\t" \
00269         "paddd %%mm4, %%mm4             \n\t" \
00270         "psubd %%mm7, %%mm4             \n\t" \
00271         "paddd %%mm2, %%mm5             \n\t" \
00272         "psrad $" #shift ", %%mm7       \n\t"\
00273         "psrad $" #shift ", %%mm4       \n\t"\
00274         "movq %%mm1, %%mm2              \n\t" \
00275         "paddd %%mm5, %%mm1             \n\t" \
00276         "psubd %%mm5, %%mm2             \n\t" \
00277         "psrad $" #shift ", %%mm1       \n\t"\
00278         "psrad $" #shift ", %%mm2       \n\t"\
00279         "packssdw %%mm1, %%mm7          \n\t" \
00280         "packssdw %%mm4, %%mm2          \n\t" \
00281         "movq %%mm7, " #dst "           \n\t"\
00282         "movq " #src1 ", %%mm1          \n\t" \
00283         "movq 80(%2), %%mm4             \n\t" \
00284         "movq %%mm2, 24+" #dst "        \n\t"\
00285         "pmaddwd %%mm1, %%mm4           \n\t" \
00286         "movq 88(%2), %%mm7             \n\t" \
00287         "pmaddwd 96(%2), %%mm1          \n\t" \
00288         "pmaddwd %%mm3, %%mm7           \n\t" \
00289         "movq %%mm0, %%mm2              \n\t" \
00290         "pmaddwd 104(%2), %%mm3         \n\t" \
00291         "paddd %%mm7, %%mm4             \n\t" \
00292         "paddd %%mm4, %%mm2             \n\t" \
00293         "psubd %%mm4, %%mm0             \n\t" \
00294         "psrad $" #shift ", %%mm2       \n\t"\
00295         "psrad $" #shift ", %%mm0       \n\t"\
00296         "movq %%mm6, %%mm4              \n\t" \
00297         "paddd %%mm1, %%mm3             \n\t" \
00298         "paddd %%mm3, %%mm6             \n\t" \
00299         "psubd %%mm3, %%mm4             \n\t" \
00300         "psrad $" #shift ", %%mm6       \n\t"\
00301         "packssdw %%mm6, %%mm2          \n\t" \
00302         "movq %%mm2, 8+" #dst "         \n\t"\
00303         "psrad $" #shift ", %%mm4       \n\t"\
00304         "packssdw %%mm0, %%mm4          \n\t" \
00305         "movq %%mm4, 16+" #dst "        \n\t"\
00306         "jmp 2f                         \n\t"\
00307         "1:                             \n\t"\
00308         "pslld $16, %%mm0               \n\t"\
00309         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
00310         "psrad $13, %%mm0               \n\t"\
00311         "packssdw %%mm0, %%mm0          \n\t"\
00312         "movq %%mm0, " #dst "           \n\t"\
00313         "movq %%mm0, 8+" #dst "         \n\t"\
00314         "movq %%mm0, 16+" #dst "        \n\t"\
00315         "movq %%mm0, 24+" #dst "        \n\t"\
00316         "2:                             \n\t"
00317 
00318 
00319 
00320 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00321 
00322 
00323 
00324 
00325 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
00326 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
00327 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
00328 
00329 
00330 
00331 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00332 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00333 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00334 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00335 
00336 #else
00337 
00338 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00339         "movq " #src0 ", %%mm0          \n\t" \
00340         "movq " #src4 ", %%mm1          \n\t" \
00341         "movq " #src1 ", %%mm2          \n\t" \
00342         "movq " #src5 ", %%mm3          \n\t" \
00343         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
00344         "pand %%mm0, %%mm4              \n\t"\
00345         "por %%mm1, %%mm4               \n\t"\
00346         "por %%mm2, %%mm4               \n\t"\
00347         "por %%mm3, %%mm4               \n\t"\
00348         "packssdw %%mm4,%%mm4           \n\t"\
00349         "movd %%mm4, %%eax              \n\t"\
00350         "orl %%eax, %%eax               \n\t"\
00351         "jz 1f                          \n\t"\
00352         "movq 16(%2), %%mm4             \n\t" \
00353         "pmaddwd %%mm0, %%mm4           \n\t" \
00354         "movq 24(%2), %%mm5             \n\t" \
00355         "pmaddwd %%mm5, %%mm0           \n\t" \
00356         "movq 32(%2), %%mm5             \n\t" \
00357         "pmaddwd %%mm1, %%mm5           \n\t" \
00358         "movq 40(%2), %%mm6             \n\t" \
00359         "pmaddwd %%mm6, %%mm1           \n\t" \
00360         "movq 48(%2), %%mm7             \n\t" \
00361         "pmaddwd %%mm2, %%mm7           \n\t" \
00362         #rounder ", %%mm4               \n\t"\
00363         "movq %%mm4, %%mm6              \n\t" \
00364         "paddd %%mm5, %%mm4             \n\t" \
00365         "psubd %%mm5, %%mm6             \n\t" \
00366         "movq 56(%2), %%mm5             \n\t" \
00367         "pmaddwd %%mm3, %%mm5           \n\t" \
00368         #rounder ", %%mm0               \n\t"\
00369         "paddd %%mm0, %%mm1             \n\t" \
00370         "paddd %%mm0, %%mm0             \n\t" \
00371         "psubd %%mm1, %%mm0             \n\t" \
00372         "pmaddwd 64(%2), %%mm2          \n\t" \
00373         "paddd %%mm5, %%mm7             \n\t" \
00374         "movq 72(%2), %%mm5             \n\t" \
00375         "pmaddwd %%mm3, %%mm5           \n\t" \
00376         "paddd %%mm4, %%mm7             \n\t" \
00377         "paddd %%mm4, %%mm4             \n\t" \
00378         "psubd %%mm7, %%mm4             \n\t" \
00379         "paddd %%mm2, %%mm5             \n\t" \
00380         "psrad $" #shift ", %%mm7       \n\t"\
00381         "psrad $" #shift ", %%mm4       \n\t"\
00382         "movq %%mm1, %%mm2              \n\t" \
00383         "paddd %%mm5, %%mm1             \n\t" \
00384         "psubd %%mm5, %%mm2             \n\t" \
00385         "psrad $" #shift ", %%mm1       \n\t"\
00386         "psrad $" #shift ", %%mm2       \n\t"\
00387         "packssdw %%mm1, %%mm7          \n\t" \
00388         "packssdw %%mm4, %%mm2          \n\t" \
00389         "movq %%mm7, " #dst "           \n\t"\
00390         "movq " #src1 ", %%mm1          \n\t" \
00391         "movq 80(%2), %%mm4             \n\t" \
00392         "movq %%mm2, 24+" #dst "        \n\t"\
00393         "pmaddwd %%mm1, %%mm4           \n\t" \
00394         "movq 88(%2), %%mm7             \n\t" \
00395         "pmaddwd 96(%2), %%mm1          \n\t" \
00396         "pmaddwd %%mm3, %%mm7           \n\t" \
00397         "movq %%mm0, %%mm2              \n\t" \
00398         "pmaddwd 104(%2), %%mm3         \n\t" \
00399         "paddd %%mm7, %%mm4             \n\t" \
00400         "paddd %%mm4, %%mm2             \n\t" \
00401         "psubd %%mm4, %%mm0             \n\t" \
00402         "psrad $" #shift ", %%mm2       \n\t"\
00403         "psrad $" #shift ", %%mm0       \n\t"\
00404         "movq %%mm6, %%mm4              \n\t" \
00405         "paddd %%mm1, %%mm3             \n\t" \
00406         "paddd %%mm3, %%mm6             \n\t" \
00407         "psubd %%mm3, %%mm4             \n\t" \
00408         "psrad $" #shift ", %%mm6       \n\t"\
00409         "packssdw %%mm6, %%mm2          \n\t" \
00410         "movq %%mm2, 8+" #dst "         \n\t"\
00411         "psrad $" #shift ", %%mm4       \n\t"\
00412         "packssdw %%mm0, %%mm4          \n\t" \
00413         "movq %%mm4, 16+" #dst "        \n\t"\
00414         "jmp 2f                         \n\t"\
00415         "1:                             \n\t"\
00416         "pslld $16, %%mm0               \n\t"\
00417         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
00418         "psrad $13, %%mm0               \n\t"\
00419         "packssdw %%mm0, %%mm0          \n\t"\
00420         "movq %%mm0, " #dst "           \n\t"\
00421         "movq %%mm0, 8+" #dst "         \n\t"\
00422         "movq %%mm0, 16+" #dst "        \n\t"\
00423         "movq %%mm0, 24+" #dst "        \n\t"\
00424         "2:                             \n\t"
00425 
00426 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
00427         "movq " #src0 ", %%mm0          \n\t" \
00428         "movq " #src4 ", %%mm1          \n\t" \
00429         "movq " #src1 ", %%mm2          \n\t" \
00430         "movq " #src5 ", %%mm3          \n\t" \
00431         "movq %%mm0, %%mm4              \n\t"\
00432         "por %%mm1, %%mm4               \n\t"\
00433         "por %%mm2, %%mm4               \n\t"\
00434         "por %%mm3, %%mm4               \n\t"\
00435         "packssdw %%mm4,%%mm4           \n\t"\
00436         "movd %%mm4, %%eax              \n\t"\
00437         "orl %%eax, %%eax               \n\t"\
00438         "jz " #bt "                     \n\t"\
00439         "movq 16(%2), %%mm4             \n\t" \
00440         "pmaddwd %%mm0, %%mm4           \n\t" \
00441         "movq 24(%2), %%mm5             \n\t" \
00442         "pmaddwd %%mm5, %%mm0           \n\t" \
00443         "movq 32(%2), %%mm5             \n\t" \
00444         "pmaddwd %%mm1, %%mm5           \n\t" \
00445         "movq 40(%2), %%mm6             \n\t" \
00446         "pmaddwd %%mm6, %%mm1           \n\t" \
00447         "movq 48(%2), %%mm7             \n\t" \
00448         "pmaddwd %%mm2, %%mm7           \n\t" \
00449         #rounder ", %%mm4               \n\t"\
00450         "movq %%mm4, %%mm6              \n\t" \
00451         "paddd %%mm5, %%mm4             \n\t" \
00452         "psubd %%mm5, %%mm6             \n\t" \
00453         "movq 56(%2), %%mm5             \n\t" \
00454         "pmaddwd %%mm3, %%mm5           \n\t" \
00455         #rounder ", %%mm0               \n\t"\
00456         "paddd %%mm0, %%mm1             \n\t" \
00457         "paddd %%mm0, %%mm0             \n\t" \
00458         "psubd %%mm1, %%mm0             \n\t" \
00459         "pmaddwd 64(%2), %%mm2          \n\t" \
00460         "paddd %%mm5, %%mm7             \n\t" \
00461         "movq 72(%2), %%mm5             \n\t" \
00462         "pmaddwd %%mm3, %%mm5           \n\t" \
00463         "paddd %%mm4, %%mm7             \n\t" \
00464         "paddd %%mm4, %%mm4             \n\t" \
00465         "psubd %%mm7, %%mm4             \n\t" \
00466         "paddd %%mm2, %%mm5             \n\t" \
00467         "psrad $" #shift ", %%mm7       \n\t"\
00468         "psrad $" #shift ", %%mm4       \n\t"\
00469         "movq %%mm1, %%mm2              \n\t" \
00470         "paddd %%mm5, %%mm1             \n\t" \
00471         "psubd %%mm5, %%mm2             \n\t" \
00472         "psrad $" #shift ", %%mm1       \n\t"\
00473         "psrad $" #shift ", %%mm2       \n\t"\
00474         "packssdw %%mm1, %%mm7          \n\t" \
00475         "packssdw %%mm4, %%mm2          \n\t" \
00476         "movq %%mm7, " #dst "           \n\t"\
00477         "movq " #src1 ", %%mm1          \n\t" \
00478         "movq 80(%2), %%mm4             \n\t" \
00479         "movq %%mm2, 24+" #dst "        \n\t"\
00480         "pmaddwd %%mm1, %%mm4           \n\t" \
00481         "movq 88(%2), %%mm7             \n\t" \
00482         "pmaddwd 96(%2), %%mm1          \n\t" \
00483         "pmaddwd %%mm3, %%mm7           \n\t" \
00484         "movq %%mm0, %%mm2              \n\t" \
00485         "pmaddwd 104(%2), %%mm3         \n\t" \
00486         "paddd %%mm7, %%mm4             \n\t" \
00487         "paddd %%mm4, %%mm2             \n\t" \
00488         "psubd %%mm4, %%mm0             \n\t" \
00489         "psrad $" #shift ", %%mm2       \n\t"\
00490         "psrad $" #shift ", %%mm0       \n\t"\
00491         "movq %%mm6, %%mm4              \n\t" \
00492         "paddd %%mm1, %%mm3             \n\t" \
00493         "paddd %%mm3, %%mm6             \n\t" \
00494         "psubd %%mm3, %%mm4             \n\t" \
00495         "psrad $" #shift ", %%mm6       \n\t"\
00496         "packssdw %%mm6, %%mm2          \n\t" \
00497         "movq %%mm2, 8+" #dst "         \n\t"\
00498         "psrad $" #shift ", %%mm4       \n\t"\
00499         "packssdw %%mm0, %%mm4          \n\t" \
00500         "movq %%mm4, 16+" #dst "        \n\t"\
00501 
00502 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00503         "movq " #src0 ", %%mm0          \n\t" \
00504         "movq " #src4 ", %%mm1          \n\t" \
00505         "movq " #src1 ", %%mm2          \n\t" \
00506         "movq " #src5 ", %%mm3          \n\t" \
00507         "movq 16(%2), %%mm4             \n\t" \
00508         "pmaddwd %%mm0, %%mm4           \n\t" \
00509         "movq 24(%2), %%mm5             \n\t" \
00510         "pmaddwd %%mm5, %%mm0           \n\t" \
00511         "movq 32(%2), %%mm5             \n\t" \
00512         "pmaddwd %%mm1, %%mm5           \n\t" \
00513         "movq 40(%2), %%mm6             \n\t" \
00514         "pmaddwd %%mm6, %%mm1           \n\t" \
00515         "movq 48(%2), %%mm7             \n\t" \
00516         "pmaddwd %%mm2, %%mm7           \n\t" \
00517         #rounder ", %%mm4               \n\t"\
00518         "movq %%mm4, %%mm6              \n\t" \
00519         "paddd %%mm5, %%mm4             \n\t" \
00520         "psubd %%mm5, %%mm6             \n\t" \
00521         "movq 56(%2), %%mm5             \n\t" \
00522         "pmaddwd %%mm3, %%mm5           \n\t" \
00523         #rounder ", %%mm0               \n\t"\
00524         "paddd %%mm0, %%mm1             \n\t" \
00525         "paddd %%mm0, %%mm0             \n\t" \
00526         "psubd %%mm1, %%mm0             \n\t" \
00527         "pmaddwd 64(%2), %%mm2          \n\t" \
00528         "paddd %%mm5, %%mm7             \n\t" \
00529         "movq 72(%2), %%mm5             \n\t" \
00530         "pmaddwd %%mm3, %%mm5           \n\t" \
00531         "paddd %%mm4, %%mm7             \n\t" \
00532         "paddd %%mm4, %%mm4             \n\t" \
00533         "psubd %%mm7, %%mm4             \n\t" \
00534         "paddd %%mm2, %%mm5             \n\t" \
00535         "psrad $" #shift ", %%mm7       \n\t"\
00536         "psrad $" #shift ", %%mm4       \n\t"\
00537         "movq %%mm1, %%mm2              \n\t" \
00538         "paddd %%mm5, %%mm1             \n\t" \
00539         "psubd %%mm5, %%mm2             \n\t" \
00540         "psrad $" #shift ", %%mm1       \n\t"\
00541         "psrad $" #shift ", %%mm2       \n\t"\
00542         "packssdw %%mm1, %%mm7          \n\t" \
00543         "packssdw %%mm4, %%mm2          \n\t" \
00544         "movq %%mm7, " #dst "           \n\t"\
00545         "movq " #src1 ", %%mm1          \n\t" \
00546         "movq 80(%2), %%mm4             \n\t" \
00547         "movq %%mm2, 24+" #dst "        \n\t"\
00548         "pmaddwd %%mm1, %%mm4           \n\t" \
00549         "movq 88(%2), %%mm7             \n\t" \
00550         "pmaddwd 96(%2), %%mm1          \n\t" \
00551         "pmaddwd %%mm3, %%mm7           \n\t" \
00552         "movq %%mm0, %%mm2              \n\t" \
00553         "pmaddwd 104(%2), %%mm3         \n\t" \
00554         "paddd %%mm7, %%mm4             \n\t" \
00555         "paddd %%mm4, %%mm2             \n\t" \
00556         "psubd %%mm4, %%mm0             \n\t" \
00557         "psrad $" #shift ", %%mm2       \n\t"\
00558         "psrad $" #shift ", %%mm0       \n\t"\
00559         "movq %%mm6, %%mm4              \n\t" \
00560         "paddd %%mm1, %%mm3             \n\t" \
00561         "paddd %%mm3, %%mm6             \n\t" \
00562         "psubd %%mm3, %%mm4             \n\t" \
00563         "psrad $" #shift ", %%mm6       \n\t"\
00564         "packssdw %%mm6, %%mm2          \n\t" \
00565         "movq %%mm2, 8+" #dst "         \n\t"\
00566         "psrad $" #shift ", %%mm4       \n\t"\
00567         "packssdw %%mm0, %%mm4          \n\t" \
00568         "movq %%mm4, 16+" #dst "        \n\t"\
00569 
00570 
00571 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
00572 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
00573 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
00574 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
00575 
00576 #undef IDCT
00577 #define IDCT(src0, src4, src1, src5, dst, shift) \
00578         "movq " #src0 ", %%mm0          \n\t" \
00579         "movq " #src4 ", %%mm1          \n\t" \
00580         "movq " #src1 ", %%mm2          \n\t" \
00581         "movq " #src5 ", %%mm3          \n\t" \
00582         "movq 16(%2), %%mm4             \n\t" \
00583         "pmaddwd %%mm0, %%mm4           \n\t" \
00584         "movq 24(%2), %%mm5             \n\t" \
00585         "pmaddwd %%mm5, %%mm0           \n\t" \
00586         "movq 32(%2), %%mm5             \n\t" \
00587         "pmaddwd %%mm1, %%mm5           \n\t" \
00588         "movq 40(%2), %%mm6             \n\t" \
00589         "pmaddwd %%mm6, %%mm1           \n\t" \
00590         "movq %%mm4, %%mm6              \n\t" \
00591         "movq 48(%2), %%mm7             \n\t" \
00592         "pmaddwd %%mm2, %%mm7           \n\t" \
00593         "paddd %%mm5, %%mm4             \n\t" \
00594         "psubd %%mm5, %%mm6             \n\t" \
00595         "movq %%mm0, %%mm5              \n\t" \
00596         "paddd %%mm1, %%mm0             \n\t" \
00597         "psubd %%mm1, %%mm5             \n\t" \
00598         "movq 56(%2), %%mm1             \n\t" \
00599         "pmaddwd %%mm3, %%mm1           \n\t" \
00600         "pmaddwd 64(%2), %%mm2          \n\t" \
00601         "paddd %%mm1, %%mm7             \n\t" \
00602         "movq 72(%2), %%mm1             \n\t" \
00603         "pmaddwd %%mm3, %%mm1           \n\t" \
00604         "paddd %%mm4, %%mm7             \n\t" \
00605         "paddd %%mm4, %%mm4             \n\t" \
00606         "psubd %%mm7, %%mm4             \n\t" \
00607         "paddd %%mm2, %%mm1             \n\t" \
00608         "psrad $" #shift ", %%mm7       \n\t"\
00609         "psrad $" #shift ", %%mm4       \n\t"\
00610         "movq %%mm0, %%mm2              \n\t" \
00611         "paddd %%mm1, %%mm0             \n\t" \
00612         "psubd %%mm1, %%mm2             \n\t" \
00613         "psrad $" #shift ", %%mm0       \n\t"\
00614         "psrad $" #shift ", %%mm2       \n\t"\
00615         "packssdw %%mm7, %%mm7          \n\t" \
00616         "movd %%mm7, " #dst "           \n\t"\
00617         "packssdw %%mm0, %%mm0          \n\t" \
00618         "movd %%mm0, 16+" #dst "        \n\t"\
00619         "packssdw %%mm2, %%mm2          \n\t" \
00620         "movd %%mm2, 96+" #dst "        \n\t"\
00621         "packssdw %%mm4, %%mm4          \n\t" \
00622         "movd %%mm4, 112+" #dst "       \n\t"\
00623         "movq " #src1 ", %%mm0          \n\t" \
00624         "movq 80(%2), %%mm4             \n\t" \
00625         "pmaddwd %%mm0, %%mm4           \n\t" \
00626         "movq 88(%2), %%mm7             \n\t" \
00627         "pmaddwd 96(%2), %%mm0          \n\t" \
00628         "pmaddwd %%mm3, %%mm7           \n\t" \
00629         "movq %%mm5, %%mm2              \n\t" \
00630         "pmaddwd 104(%2), %%mm3         \n\t" \
00631         "paddd %%mm7, %%mm4             \n\t" \
00632         "paddd %%mm4, %%mm2             \n\t" \
00633         "psubd %%mm4, %%mm5             \n\t" \
00634         "psrad $" #shift ", %%mm2       \n\t"\
00635         "psrad $" #shift ", %%mm5       \n\t"\
00636         "movq %%mm6, %%mm4              \n\t" \
00637         "paddd %%mm0, %%mm3             \n\t" \
00638         "paddd %%mm3, %%mm6             \n\t" \
00639         "psubd %%mm3, %%mm4             \n\t" \
00640         "psrad $" #shift ", %%mm6       \n\t"\
00641         "psrad $" #shift ", %%mm4       \n\t"\
00642         "packssdw %%mm2, %%mm2          \n\t" \
00643         "packssdw %%mm6, %%mm6          \n\t" \
00644         "movd %%mm2, 32+" #dst "        \n\t"\
00645         "packssdw %%mm4, %%mm4          \n\t" \
00646         "packssdw %%mm5, %%mm5          \n\t" \
00647         "movd %%mm6, 48+" #dst "        \n\t"\
00648         "movd %%mm4, 64+" #dst "        \n\t"\
00649         "movd %%mm5, 80+" #dst "        \n\t"
00650 
00651 
00652 
00653 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00654 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00655 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00656 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00657         "jmp 9f                         \n\t"
00658 
00659         "# .p2align 4                   \n\t"\
00660         "4:                             \n\t"
00661 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
00662 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
00663 
00664 #undef IDCT
00665 #define IDCT(src0, src4, src1, src5, dst, shift) \
00666         "movq " #src0 ", %%mm0          \n\t" \
00667         "movq " #src4 ", %%mm1          \n\t" \
00668         "movq " #src5 ", %%mm3          \n\t" \
00669         "movq 16(%2), %%mm4             \n\t" \
00670         "pmaddwd %%mm0, %%mm4           \n\t" \
00671         "movq 24(%2), %%mm5             \n\t" \
00672         "pmaddwd %%mm5, %%mm0           \n\t" \
00673         "movq 32(%2), %%mm5             \n\t" \
00674         "pmaddwd %%mm1, %%mm5           \n\t" \
00675         "movq 40(%2), %%mm6             \n\t" \
00676         "pmaddwd %%mm6, %%mm1           \n\t" \
00677         "movq %%mm4, %%mm6              \n\t" \
00678         "paddd %%mm5, %%mm4             \n\t" \
00679         "psubd %%mm5, %%mm6             \n\t" \
00680         "movq %%mm0, %%mm5              \n\t" \
00681         "paddd %%mm1, %%mm0             \n\t" \
00682         "psubd %%mm1, %%mm5             \n\t" \
00683         "movq 56(%2), %%mm1             \n\t" \
00684         "pmaddwd %%mm3, %%mm1           \n\t" \
00685         "movq 72(%2), %%mm7             \n\t" \
00686         "pmaddwd %%mm3, %%mm7           \n\t" \
00687         "paddd %%mm4, %%mm1             \n\t" \
00688         "paddd %%mm4, %%mm4             \n\t" \
00689         "psubd %%mm1, %%mm4             \n\t" \
00690         "psrad $" #shift ", %%mm1       \n\t"\
00691         "psrad $" #shift ", %%mm4       \n\t"\
00692         "movq %%mm0, %%mm2              \n\t" \
00693         "paddd %%mm7, %%mm0             \n\t" \
00694         "psubd %%mm7, %%mm2             \n\t" \
00695         "psrad $" #shift ", %%mm0       \n\t"\
00696         "psrad $" #shift ", %%mm2       \n\t"\
00697         "packssdw %%mm1, %%mm1          \n\t" \
00698         "movd %%mm1, " #dst "           \n\t"\
00699         "packssdw %%mm0, %%mm0          \n\t" \
00700         "movd %%mm0, 16+" #dst "        \n\t"\
00701         "packssdw %%mm2, %%mm2          \n\t" \
00702         "movd %%mm2, 96+" #dst "        \n\t"\
00703         "packssdw %%mm4, %%mm4          \n\t" \
00704         "movd %%mm4, 112+" #dst "       \n\t"\
00705         "movq 88(%2), %%mm1             \n\t" \
00706         "pmaddwd %%mm3, %%mm1           \n\t" \
00707         "movq %%mm5, %%mm2              \n\t" \
00708         "pmaddwd 104(%2), %%mm3         \n\t" \
00709         "paddd %%mm1, %%mm2             \n\t" \
00710         "psubd %%mm1, %%mm5             \n\t" \
00711         "psrad $" #shift ", %%mm2       \n\t"\
00712         "psrad $" #shift ", %%mm5       \n\t"\
00713         "movq %%mm6, %%mm1              \n\t" \
00714         "paddd %%mm3, %%mm6             \n\t" \
00715         "psubd %%mm3, %%mm1             \n\t" \
00716         "psrad $" #shift ", %%mm6       \n\t"\
00717         "psrad $" #shift ", %%mm1       \n\t"\
00718         "packssdw %%mm2, %%mm2          \n\t" \
00719         "packssdw %%mm6, %%mm6          \n\t" \
00720         "movd %%mm2, 32+" #dst "        \n\t"\
00721         "packssdw %%mm1, %%mm1          \n\t" \
00722         "packssdw %%mm5, %%mm5          \n\t" \
00723         "movd %%mm6, 48+" #dst "        \n\t"\
00724         "movd %%mm1, 64+" #dst "        \n\t"\
00725         "movd %%mm5, 80+" #dst "        \n\t"
00726 
00727 
00728 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00729 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00730 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00731 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00732         "jmp 9f                         \n\t"
00733 
00734         "# .p2align 4                   \n\t"\
00735         "6:                             \n\t"
00736 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
00737 
00738 #undef IDCT
00739 #define IDCT(src0, src4, src1, src5, dst, shift) \
00740         "movq " #src0 ", %%mm0          \n\t" \
00741         "movq " #src5 ", %%mm3          \n\t" \
00742         "movq 16(%2), %%mm4             \n\t" \
00743         "pmaddwd %%mm0, %%mm4           \n\t" \
00744         "movq 24(%2), %%mm5             \n\t" \
00745         "pmaddwd %%mm5, %%mm0           \n\t" \
00746         "movq %%mm4, %%mm6              \n\t" \
00747         "movq %%mm0, %%mm5              \n\t" \
00748         "movq 56(%2), %%mm1             \n\t" \
00749         "pmaddwd %%mm3, %%mm1           \n\t" \
00750         "movq 72(%2), %%mm7             \n\t" \
00751         "pmaddwd %%mm3, %%mm7           \n\t" \
00752         "paddd %%mm4, %%mm1             \n\t" \
00753         "paddd %%mm4, %%mm4             \n\t" \
00754         "psubd %%mm1, %%mm4             \n\t" \
00755         "psrad $" #shift ", %%mm1       \n\t"\
00756         "psrad $" #shift ", %%mm4       \n\t"\
00757         "movq %%mm0, %%mm2              \n\t" \
00758         "paddd %%mm7, %%mm0             \n\t" \
00759         "psubd %%mm7, %%mm2             \n\t" \
00760         "psrad $" #shift ", %%mm0       \n\t"\
00761         "psrad $" #shift ", %%mm2       \n\t"\
00762         "packssdw %%mm1, %%mm1          \n\t" \
00763         "movd %%mm1, " #dst "           \n\t"\
00764         "packssdw %%mm0, %%mm0          \n\t" \
00765         "movd %%mm0, 16+" #dst "        \n\t"\
00766         "packssdw %%mm2, %%mm2          \n\t" \
00767         "movd %%mm2, 96+" #dst "        \n\t"\
00768         "packssdw %%mm4, %%mm4          \n\t" \
00769         "movd %%mm4, 112+" #dst "       \n\t"\
00770         "movq 88(%2), %%mm1             \n\t" \
00771         "pmaddwd %%mm3, %%mm1           \n\t" \
00772         "movq %%mm5, %%mm2              \n\t" \
00773         "pmaddwd 104(%2), %%mm3         \n\t" \
00774         "paddd %%mm1, %%mm2             \n\t" \
00775         "psubd %%mm1, %%mm5             \n\t" \
00776         "psrad $" #shift ", %%mm2       \n\t"\
00777         "psrad $" #shift ", %%mm5       \n\t"\
00778         "movq %%mm6, %%mm1              \n\t" \
00779         "paddd %%mm3, %%mm6             \n\t" \
00780         "psubd %%mm3, %%mm1             \n\t" \
00781         "psrad $" #shift ", %%mm6       \n\t"\
00782         "psrad $" #shift ", %%mm1       \n\t"\
00783         "packssdw %%mm2, %%mm2          \n\t" \
00784         "packssdw %%mm6, %%mm6          \n\t" \
00785         "movd %%mm2, 32+" #dst "        \n\t"\
00786         "packssdw %%mm1, %%mm1          \n\t" \
00787         "packssdw %%mm5, %%mm5          \n\t" \
00788         "movd %%mm6, 48+" #dst "        \n\t"\
00789         "movd %%mm1, 64+" #dst "        \n\t"\
00790         "movd %%mm5, 80+" #dst "        \n\t"
00791 
00792 
00793 
00794 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00795 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00796 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00797 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00798         "jmp 9f                         \n\t"
00799 
00800         "# .p2align 4                   \n\t"\
00801         "2:                             \n\t"
00802 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
00803 
00804 #undef IDCT
00805 #define IDCT(src0, src4, src1, src5, dst, shift) \
00806         "movq " #src0 ", %%mm0          \n\t" \
00807         "movq " #src1 ", %%mm2          \n\t" \
00808         "movq " #src5 ", %%mm3          \n\t" \
00809         "movq 16(%2), %%mm4             \n\t" \
00810         "pmaddwd %%mm0, %%mm4           \n\t" \
00811         "movq 24(%2), %%mm5             \n\t" \
00812         "pmaddwd %%mm5, %%mm0           \n\t" \
00813         "movq %%mm4, %%mm6              \n\t" \
00814         "movq 48(%2), %%mm7             \n\t" \
00815         "pmaddwd %%mm2, %%mm7           \n\t" \
00816         "movq %%mm0, %%mm5              \n\t" \
00817         "movq 56(%2), %%mm1             \n\t" \
00818         "pmaddwd %%mm3, %%mm1           \n\t" \
00819         "pmaddwd 64(%2), %%mm2          \n\t" \
00820         "paddd %%mm1, %%mm7             \n\t" \
00821         "movq 72(%2), %%mm1             \n\t" \
00822         "pmaddwd %%mm3, %%mm1           \n\t" \
00823         "paddd %%mm4, %%mm7             \n\t" \
00824         "paddd %%mm4, %%mm4             \n\t" \
00825         "psubd %%mm7, %%mm4             \n\t" \
00826         "paddd %%mm2, %%mm1             \n\t" \
00827         "psrad $" #shift ", %%mm7       \n\t"\
00828         "psrad $" #shift ", %%mm4       \n\t"\
00829         "movq %%mm0, %%mm2              \n\t" \
00830         "paddd %%mm1, %%mm0             \n\t" \
00831         "psubd %%mm1, %%mm2             \n\t" \
00832         "psrad $" #shift ", %%mm0       \n\t"\
00833         "psrad $" #shift ", %%mm2       \n\t"\
00834         "packssdw %%mm7, %%mm7          \n\t" \
00835         "movd %%mm7, " #dst "           \n\t"\
00836         "packssdw %%mm0, %%mm0          \n\t" \
00837         "movd %%mm0, 16+" #dst "        \n\t"\
00838         "packssdw %%mm2, %%mm2          \n\t" \
00839         "movd %%mm2, 96+" #dst "        \n\t"\
00840         "packssdw %%mm4, %%mm4          \n\t" \
00841         "movd %%mm4, 112+" #dst "       \n\t"\
00842         "movq " #src1 ", %%mm0          \n\t" \
00843         "movq 80(%2), %%mm4             \n\t" \
00844         "pmaddwd %%mm0, %%mm4           \n\t" \
00845         "movq 88(%2), %%mm7             \n\t" \
00846         "pmaddwd 96(%2), %%mm0          \n\t" \
00847         "pmaddwd %%mm3, %%mm7           \n\t" \
00848         "movq %%mm5, %%mm2              \n\t" \
00849         "pmaddwd 104(%2), %%mm3         \n\t" \
00850         "paddd %%mm7, %%mm4             \n\t" \
00851         "paddd %%mm4, %%mm2             \n\t" \
00852         "psubd %%mm4, %%mm5             \n\t" \
00853         "psrad $" #shift ", %%mm2       \n\t"\
00854         "psrad $" #shift ", %%mm5       \n\t"\
00855         "movq %%mm6, %%mm4              \n\t" \
00856         "paddd %%mm0, %%mm3             \n\t" \
00857         "paddd %%mm3, %%mm6             \n\t" \
00858         "psubd %%mm3, %%mm4             \n\t" \
00859         "psrad $" #shift ", %%mm6       \n\t"\
00860         "psrad $" #shift ", %%mm4       \n\t"\
00861         "packssdw %%mm2, %%mm2          \n\t" \
00862         "packssdw %%mm6, %%mm6          \n\t" \
00863         "movd %%mm2, 32+" #dst "        \n\t"\
00864         "packssdw %%mm4, %%mm4          \n\t" \
00865         "packssdw %%mm5, %%mm5          \n\t" \
00866         "movd %%mm6, 48+" #dst "        \n\t"\
00867         "movd %%mm4, 64+" #dst "        \n\t"\
00868         "movd %%mm5, 80+" #dst "        \n\t"
00869 
00870 
00871 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00872 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00873 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00874 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00875         "jmp 9f                         \n\t"
00876 
00877         "# .p2align 4                   \n\t"\
00878         "3:                             \n\t"
00879 #undef IDCT
00880 #define IDCT(src0, src4, src1, src5, dst, shift) \
00881         "movq " #src0 ", %%mm0          \n\t" \
00882         "movq " #src1 ", %%mm2          \n\t" \
00883         "movq 16(%2), %%mm4             \n\t" \
00884         "pmaddwd %%mm0, %%mm4           \n\t" \
00885         "movq 24(%2), %%mm5             \n\t" \
00886         "pmaddwd %%mm5, %%mm0           \n\t" \
00887         "movq %%mm4, %%mm6              \n\t" \
00888         "movq 48(%2), %%mm7             \n\t" \
00889         "pmaddwd %%mm2, %%mm7           \n\t" \
00890         "movq %%mm0, %%mm5              \n\t" \
00891         "movq 64(%2), %%mm3             \n\t"\
00892         "pmaddwd %%mm2, %%mm3           \n\t" \
00893         "paddd %%mm4, %%mm7             \n\t" \
00894         "paddd %%mm4, %%mm4             \n\t" \
00895         "psubd %%mm7, %%mm4             \n\t" \
00896         "psrad $" #shift ", %%mm7       \n\t"\
00897         "psrad $" #shift ", %%mm4       \n\t"\
00898         "movq %%mm0, %%mm1              \n\t" \
00899         "paddd %%mm3, %%mm0             \n\t" \
00900         "psubd %%mm3, %%mm1             \n\t" \
00901         "psrad $" #shift ", %%mm0       \n\t"\
00902         "psrad $" #shift ", %%mm1       \n\t"\
00903         "packssdw %%mm7, %%mm7          \n\t" \
00904         "movd %%mm7, " #dst "           \n\t"\
00905         "packssdw %%mm0, %%mm0          \n\t" \
00906         "movd %%mm0, 16+" #dst "        \n\t"\
00907         "packssdw %%mm1, %%mm1          \n\t" \
00908         "movd %%mm1, 96+" #dst "        \n\t"\
00909         "packssdw %%mm4, %%mm4          \n\t" \
00910         "movd %%mm4, 112+" #dst "       \n\t"\
00911         "movq 80(%2), %%mm4             \n\t" \
00912         "pmaddwd %%mm2, %%mm4           \n\t" \
00913         "pmaddwd 96(%2), %%mm2          \n\t" \
00914         "movq %%mm5, %%mm1              \n\t" \
00915         "paddd %%mm4, %%mm1             \n\t" \
00916         "psubd %%mm4, %%mm5             \n\t" \
00917         "psrad $" #shift ", %%mm1       \n\t"\
00918         "psrad $" #shift ", %%mm5       \n\t"\
00919         "movq %%mm6, %%mm4              \n\t" \
00920         "paddd %%mm2, %%mm6             \n\t" \
00921         "psubd %%mm2, %%mm4             \n\t" \
00922         "psrad $" #shift ", %%mm6       \n\t"\
00923         "psrad $" #shift ", %%mm4       \n\t"\
00924         "packssdw %%mm1, %%mm1          \n\t" \
00925         "packssdw %%mm6, %%mm6          \n\t" \
00926         "movd %%mm1, 32+" #dst "        \n\t"\
00927         "packssdw %%mm4, %%mm4          \n\t" \
00928         "packssdw %%mm5, %%mm5          \n\t" \
00929         "movd %%mm6, 48+" #dst "        \n\t"\
00930         "movd %%mm4, 64+" #dst "        \n\t"\
00931         "movd %%mm5, 80+" #dst "        \n\t"
00932 
00933 
00934 
00935 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
00936 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
00937 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
00938 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00939         "jmp 9f                         \n\t"
00940 
00941         "# .p2align 4                   \n\t"\
00942         "5:                             \n\t"
00943 #undef IDCT
00944 #define IDCT(src0, src4, src1, src5, dst, shift) \
00945         "movq " #src0 ", %%mm0          \n\t" \
00946         "movq " #src4 ", %%mm1          \n\t" \
00947         "movq 16(%2), %%mm4             \n\t" \
00948         "pmaddwd %%mm0, %%mm4           \n\t" \
00949         "movq 24(%2), %%mm5             \n\t" \
00950         "pmaddwd %%mm5, %%mm0           \n\t" \
00951         "movq 32(%2), %%mm5             \n\t" \
00952         "pmaddwd %%mm1, %%mm5           \n\t" \
00953         "movq 40(%2), %%mm6             \n\t" \
00954         "pmaddwd %%mm6, %%mm1           \n\t" \
00955         "movq %%mm4, %%mm6              \n\t" \
00956         "paddd %%mm5, %%mm4             \n\t" \
00957         "psubd %%mm5, %%mm6             \n\t" \
00958         "movq %%mm0, %%mm5              \n\t" \
00959         "paddd %%mm1, %%mm0             \n\t" \
00960         "psubd %%mm1, %%mm5             \n\t" \
00961         "movq 8+" #src0 ", %%mm2        \n\t" \
00962         "movq 8+" #src4 ", %%mm3        \n\t" \
00963         "movq 16(%2), %%mm1             \n\t" \
00964         "pmaddwd %%mm2, %%mm1           \n\t" \
00965         "movq 24(%2), %%mm7             \n\t" \
00966         "pmaddwd %%mm7, %%mm2           \n\t" \
00967         "movq 32(%2), %%mm7             \n\t" \
00968         "pmaddwd %%mm3, %%mm7           \n\t" \
00969         "pmaddwd 40(%2), %%mm3          \n\t" \
00970         "paddd %%mm1, %%mm7             \n\t" \
00971         "paddd %%mm1, %%mm1             \n\t" \
00972         "psubd %%mm7, %%mm1             \n\t" \
00973         "paddd %%mm2, %%mm3             \n\t" \
00974         "paddd %%mm2, %%mm2             \n\t" \
00975         "psubd %%mm3, %%mm2             \n\t" \
00976         "psrad $" #shift ", %%mm4       \n\t"\
00977         "psrad $" #shift ", %%mm7       \n\t"\
00978         "psrad $" #shift ", %%mm3       \n\t"\
00979         "packssdw %%mm7, %%mm4          \n\t" \
00980         "movq %%mm4, " #dst "           \n\t"\
00981         "psrad $" #shift ", %%mm0       \n\t"\
00982         "packssdw %%mm3, %%mm0          \n\t" \
00983         "movq %%mm0, 16+" #dst "        \n\t"\
00984         "movq %%mm0, 96+" #dst "        \n\t"\
00985         "movq %%mm4, 112+" #dst "       \n\t"\
00986         "psrad $" #shift ", %%mm5       \n\t"\
00987         "psrad $" #shift ", %%mm6       \n\t"\
00988         "psrad $" #shift ", %%mm2       \n\t"\
00989         "packssdw %%mm2, %%mm5          \n\t" \
00990         "movq %%mm5, 32+" #dst "        \n\t"\
00991         "psrad $" #shift ", %%mm1       \n\t"\
00992         "packssdw %%mm1, %%mm6          \n\t" \
00993         "movq %%mm6, 48+" #dst "        \n\t"\
00994         "movq %%mm6, 64+" #dst "        \n\t"\
00995         "movq %%mm5, 80+" #dst "        \n\t"
00996 
00997 
00998 
00999 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01000 
01001 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01002 
01003         "jmp 9f                         \n\t"
01004 
01005 
01006         "# .p2align 4                   \n\t"\
01007         "1:                             \n\t"
01008 #undef IDCT
01009 #define IDCT(src0, src4, src1, src5, dst, shift) \
01010         "movq " #src0 ", %%mm0          \n\t" \
01011         "movq " #src4 ", %%mm1          \n\t" \
01012         "movq " #src1 ", %%mm2          \n\t" \
01013         "movq 16(%2), %%mm4             \n\t" \
01014         "pmaddwd %%mm0, %%mm4           \n\t" \
01015         "movq 24(%2), %%mm5             \n\t" \
01016         "pmaddwd %%mm5, %%mm0           \n\t" \
01017         "movq 32(%2), %%mm5             \n\t" \
01018         "pmaddwd %%mm1, %%mm5           \n\t" \
01019         "movq 40(%2), %%mm6             \n\t" \
01020         "pmaddwd %%mm6, %%mm1           \n\t" \
01021         "movq %%mm4, %%mm6              \n\t" \
01022         "movq 48(%2), %%mm7             \n\t" \
01023         "pmaddwd %%mm2, %%mm7           \n\t" \
01024         "paddd %%mm5, %%mm4             \n\t" \
01025         "psubd %%mm5, %%mm6             \n\t" \
01026         "movq %%mm0, %%mm5              \n\t" \
01027         "paddd %%mm1, %%mm0             \n\t" \
01028         "psubd %%mm1, %%mm5             \n\t" \
01029         "movq 64(%2), %%mm1             \n\t"\
01030         "pmaddwd %%mm2, %%mm1           \n\t" \
01031         "paddd %%mm4, %%mm7             \n\t" \
01032         "paddd %%mm4, %%mm4             \n\t" \
01033         "psubd %%mm7, %%mm4             \n\t" \
01034         "psrad $" #shift ", %%mm7       \n\t"\
01035         "psrad $" #shift ", %%mm4       \n\t"\
01036         "movq %%mm0, %%mm3              \n\t" \
01037         "paddd %%mm1, %%mm0             \n\t" \
01038         "psubd %%mm1, %%mm3             \n\t" \
01039         "psrad $" #shift ", %%mm0       \n\t"\
01040         "psrad $" #shift ", %%mm3       \n\t"\
01041         "packssdw %%mm7, %%mm7          \n\t" \
01042         "movd %%mm7, " #dst "           \n\t"\
01043         "packssdw %%mm0, %%mm0          \n\t" \
01044         "movd %%mm0, 16+" #dst "        \n\t"\
01045         "packssdw %%mm3, %%mm3          \n\t" \
01046         "movd %%mm3, 96+" #dst "        \n\t"\
01047         "packssdw %%mm4, %%mm4          \n\t" \
01048         "movd %%mm4, 112+" #dst "       \n\t"\
01049         "movq 80(%2), %%mm4             \n\t" \
01050         "pmaddwd %%mm2, %%mm4           \n\t" \
01051         "pmaddwd 96(%2), %%mm2          \n\t" \
01052         "movq %%mm5, %%mm3              \n\t" \
01053         "paddd %%mm4, %%mm3             \n\t" \
01054         "psubd %%mm4, %%mm5             \n\t" \
01055         "psrad $" #shift ", %%mm3       \n\t"\
01056         "psrad $" #shift ", %%mm5       \n\t"\
01057         "movq %%mm6, %%mm4              \n\t" \
01058         "paddd %%mm2, %%mm6             \n\t" \
01059         "psubd %%mm2, %%mm4             \n\t" \
01060         "psrad $" #shift ", %%mm6       \n\t"\
01061         "packssdw %%mm3, %%mm3          \n\t" \
01062         "movd %%mm3, 32+" #dst "        \n\t"\
01063         "psrad $" #shift ", %%mm4       \n\t"\
01064         "packssdw %%mm6, %%mm6          \n\t" \
01065         "movd %%mm6, 48+" #dst "        \n\t"\
01066         "packssdw %%mm4, %%mm4          \n\t" \
01067         "packssdw %%mm5, %%mm5          \n\t" \
01068         "movd %%mm4, 64+" #dst "        \n\t"\
01069         "movd %%mm5, 80+" #dst "        \n\t"
01070 
01071 
01072 
01073 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01074 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
01075 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01076 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01077         "jmp 9f                         \n\t"
01078 
01079 
01080         "# .p2align 4                   \n\t"
01081         "7:                             \n\t"
01082 #undef IDCT
01083 #define IDCT(src0, src4, src1, src5, dst, shift) \
01084         "movq " #src0 ", %%mm0          \n\t" \
01085         "movq 16(%2), %%mm4             \n\t" \
01086         "pmaddwd %%mm0, %%mm4           \n\t" \
01087         "movq 24(%2), %%mm5             \n\t" \
01088         "pmaddwd %%mm5, %%mm0           \n\t" \
01089         "psrad $" #shift ", %%mm4       \n\t"\
01090         "psrad $" #shift ", %%mm0       \n\t"\
01091         "movq 8+" #src0 ", %%mm2        \n\t" \
01092         "movq 16(%2), %%mm1             \n\t" \
01093         "pmaddwd %%mm2, %%mm1           \n\t" \
01094         "movq 24(%2), %%mm7             \n\t" \
01095         "pmaddwd %%mm7, %%mm2           \n\t" \
01096         "movq 32(%2), %%mm7             \n\t" \
01097         "psrad $" #shift ", %%mm1       \n\t"\
01098         "packssdw %%mm1, %%mm4          \n\t" \
01099         "movq %%mm4, " #dst "           \n\t"\
01100         "psrad $" #shift ", %%mm2       \n\t"\
01101         "packssdw %%mm2, %%mm0          \n\t" \
01102         "movq %%mm0, 16+" #dst "        \n\t"\
01103         "movq %%mm0, 96+" #dst "        \n\t"\
01104         "movq %%mm4, 112+" #dst "       \n\t"\
01105         "movq %%mm0, 32+" #dst "        \n\t"\
01106         "movq %%mm4, 48+" #dst "        \n\t"\
01107         "movq %%mm4, 64+" #dst "        \n\t"\
01108         "movq %%mm0, 80+" #dst "        \n\t"
01109 
01110 
01111 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
01112 
01113 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
01114 
01115 
01116 
01117 #endif
01118 
01119 
01120 
01121 
01122 
01123 
01124 
01125 
01126 
01127 
01128 
01129 
01130 
01131 
01132 
01133 
01134 
01135 
01136 
01137 
01138 
01139 
01140 
01141 "9: \n\t"
01142                 :: "r" (block), "r" (temp), "r" (coeffs)
01143                 : "%eax"
01144         );
01145 }
01146 
/**
 * Public entry point for the MMX inverse DCT.
 *
 * All of the work is delegated to the file-local idct() routine, which
 * reads the coefficients from block and writes its output back into the
 * same buffer.
 *
 * @param block coefficient buffer; transformed in place
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
01151 
01152 
01153 
01154 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01155 {
01156     idct(block);
01157     ff_put_pixels_clamped_mmx(block, dest, line_size);
01158 }
01159 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01160 {
01161     idct(block);
01162     ff_add_pixels_clamped_mmx(block, dest, line_size);
01163 }