#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


#if HAVE_INLINE_ASM

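/* Read an 8x8 block of 8-bit pixels and expand it into 64 16-bit DCTELEMs.
 * Two rows are handled per iteration; %%REG_a counts the output offset from
 * -128 up to 0 relative to block+64. */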
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

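/* Same as get_pixels_mmx(), but with SSE2 a whole 8-pixel row fits in one
 * XMM register, so the 8 rows are fully unrolled in two groups of four. */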
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4,      %%xmm4         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

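/* block[i] = s1[i] - s2[i] for an 8x8 block, widened to 16 bits. */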
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

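/* Sum of all 256 pixels of a 16x16 block. The 16-bit lanes cannot overflow:
 * the final total is at most 256 * 255 = 65280, hence the & 0xFFFF after the
 * horizontal fold. */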
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}

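/* Sum of squares of all pixels of a 16x16 block, accumulated as 32-bit
 * values via pmaddwd. */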
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2, pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2, pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 += mm1 */

      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

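/* Sum of squared differences between two 8-pixel-wide blocks; two rows are
 * handled per iteration, so h must be even. */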
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* |pix1 - pix2|: subtract with unsigned saturation in both directions
       * and OR the results to get the absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

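/* Sum of squared differences between two 16-pixel-wide blocks, one row per
 * iteration. */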
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* |pix1 - pix2|: subtract with unsigned saturation in both directions
       * and OR the results to get the absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

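/* Sum of absolute differences between the horizontal gradients of vertically
 * adjacent lines over an 8-pixel-wide block: a measure of high-frequency
 * noise, used by the nsse comparison functions below. The first two rows are
 * peeled off before the loop, which then handles two rows per iteration. */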
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

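/* As hf_noise8_mmx(), but for a 16-pixel-wide block: the left half is handled
 * inline (using unaligned loads of pix+1 for the horizontal gradient) and the
 * right half is delegated to hf_noise8_mmx(). */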
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

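/* Noise-preserving sum of squared differences: the plain SSE plus a penalty
 * for the difference in high-frequency noise between the two blocks, weighted
 * by avctx->nsse_weight (8 when no context is available). */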
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

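/* Intra vertical SAD: sum of |pix[y][x] - pix[y+1][x]| between each pair of
 * successive rows of a 16-pixel-wide block; h must be even. The total is
 * carried in 16-bit lanes, hence the final & 0xFFFF. */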
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

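/* As vsad_intra16_mmx(), but using psadbw, so no byte unpacking or final
 * horizontal fold is needed. */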
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

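/* Inter vertical SAD: like vsad_intra16_mmx(), but applied to the difference
 * pix1 - pix2. mm7 is set to 0x80 in every byte; xoring the bytewise
 * difference with it adds a +128 bias so the signed difference can be
 * handled as unsigned. */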
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

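/* MMXEXT version of vsad16_mmx(), using psadbw for the absolute-difference
 * accumulation. */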
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

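/* dst[i] = src1[i] - src2[i], bytewise; the bulk is done 16 bytes at a time
 * in MMX, with a scalar loop for the tail. */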
static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    x86_reg i=0;
    if(w>=16)
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

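/* HuffYUV median prediction subtraction:
 * dst[i] = src2[i] - median(L, T, L + T - LT), with L = src2[i-1],
 * T = src1[i], LT = src1[i-1]. dst[0] is fixed up in C afterwards using
 * *left and *left_top, which are then updated for the next slice. */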
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "movq  (%1, %0), %%mm0          \n\t" /* LT */
        "psllq $8, %%mm0                \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm1          \n\t" /* T */
        "movq  -1(%2, %0), %%mm2        \n\t" /* L */
        "movq  (%2, %0), %%mm3          \n\t" /* X */
        "movq %%mm2, %%mm4              \n\t" /* L */
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" /* L + T - LT */
        "movq %%mm4, %%mm5              \n\t" /* L */
        "pmaxub %%mm1, %%mm4            \n\t" /* max(T, L) */
        "pminub %%mm5, %%mm1            \n\t" /* min(T, L) */
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t" /* median */
        "psubb %%mm4, %%mm3             \n\t" /* dst - pred */
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "movq -1(%1, %0), %%mm0         \n\t" /* LT */
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

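/* Per-word absolute value of register a, clobbering z. Three variants:
 * plain MMX (sign mask), MMXEXT (pmaxsw against the negation) and
 * SSSE3 (pabsw). */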
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

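/* Horizontal sum of the 16-bit lanes of a into dst, using t as scratch.
 * The adds saturate (paddusw), so the result is clamped to 0xFFFF. */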
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

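/* Accumulate the absolute values of four loads from the DCT block at byte
 * offset o into registers 0 and 1 of the given register file (mm or xmm). */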
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

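/* Instantiate sum_abs_dctelem_<cpu>(): the sum of |coeff| over a 64-element
 * DCT block. DCT_SAD, MMABS and HSUM are rebound to the matching CPU
 * variants before each expansion below, so the same template yields the MMX,
 * MMXEXT, SSE2 and SSSE3 versions. */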
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

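/* Sum of squared differences between an int8 and an int16 array of the given
 * size (which must be a multiple of 8). The loop relies on the flags set by
 * "sub $8, %0": the intervening MMX instructions do not modify EFLAGS, so
 * "jg 1b" still tests the decremented index. */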
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

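/* Add the high dword of a into its low dword, using t as scratch. */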
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"

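/*
 * pmulhw:   dst[0..15] = (src[0..15] * dst[0..15])[16..31]
 * pmulhrw:  dst[0..15] = (src[0..15] * dst[0..15] + 0x8000)[16..31]
 * pmulhrsw: dst[0..15] = (src[0..15] * dst[0..15] + 0x4000)[15..30]
 */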
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
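
/* dsputil_qns_template.c is included once per CPU flavor: DEF names the
 * generated functions, while SET_RND, SCALE_OFFSET and PMULHRW select the
 * matching rounding-multiply sequence for each instantiation. */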
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t"
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

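/* Implemented in external assembly. */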
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)
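
/* Select the fastest available implementation for each DSPContext entry,
 * according to the runtime CPU flags. */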
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    int bit_depth = avctx->bits_per_raw_sample;

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if (bit_depth <= 8 &&
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->fdct = ff_fdct_mmx2;
            } else {
                c->fdct = ff_fdct_mmx;
            }
        }

        if (bit_depth <= 8)
            c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (bit_depth <= 8)
                c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
        }

#if HAVE_SSSE3_INLINE
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(mm_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

        if (EXTERNAL_MMXEXT(mm_flags)) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
        }

        if (EXTERNAL_SSE2(mm_flags)) {
            c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
            c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        }

        if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
        }
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}