#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

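/* Read an 8x8 block of pixels and store it as 16-bit DCT coefficients (zero-extended). */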
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq %%mm2, 16(%1, %%"REG_a") \n\t"
        "movq %%mm3, 24(%1, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add $32, %%"REG_a" \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

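/* SSE2 version of get_pixels: one full 8-pixel row per XMM register. */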
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7, %%xmm7 \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "lea (%0,%2,4), %0 \n\t"
        "punpcklbw %%xmm7, %%xmm0 \n\t"
        "punpcklbw %%xmm7, %%xmm1 \n\t"
        "punpcklbw %%xmm7, %%xmm2 \n\t"
        "punpcklbw %%xmm7, %%xmm3 \n\t"
        "movdqa %%xmm0, (%1) \n\t"
        "movdqa %%xmm1, 16(%1) \n\t"
        "movdqa %%xmm2, 32(%1) \n\t"
        "movdqa %%xmm3, 48(%1) \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "punpcklbw %%xmm7, %%xmm0 \n\t"
        "punpcklbw %%xmm7, %%xmm1 \n\t"
        "punpcklbw %%xmm7, %%xmm2 \n\t"
        "punpcklbw %%xmm7, %%xmm3 \n\t"
        "movdqa %%xmm0, 64(%1) \n\t"
        "movdqa %%xmm1, 80(%1) \n\t"
        "movdqa %%xmm2, 96(%1) \n\t"
        "movdqa %%xmm3, 112(%1) \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

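/* block[i] = s1[i] - s2[i] over an 8x8 block, widened to 16 bits. */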
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128, %%"REG_a" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm1, 8(%2, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add %3, %1 \n\t"
        "add $16, %%"REG_a" \n\t"
        "jnz 1b \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

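/* Sum of all pixel values of a 16x16 block. */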
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"
        " js 1b \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}

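/* Sum of squared pixel values of a 16x16 block. */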
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"
        "movq 8(%0),%%mm3\n"

        "movq %%mm2,%%mm1\n"

        "punpckhbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm2\n"

        "movq %%mm3,%%mm4\n"
        "punpckhbw %%mm0,%%mm3\n"
        "punpcklbw %%mm0,%%mm4\n"

        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm2,%%mm2\n"

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"

        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

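/* Sum of squared errors between two 8-pixel-wide blocks of height h; the absolute
 * differences are obtained by subtracting with unsigned saturation in both
 * directions and ORing the results. */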
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq (%0,%3),%%mm3\n"
        "movq (%1,%3),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"
        "lea (%1,%3,2), %1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

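/* Sum of squared errors between two 16-pixel-wide blocks of height h (MMX, one row per iteration). */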
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm3\n"
        "movq 8(%1),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

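/* SSE2 version of sse16: two rows per iteration, unaligned 16-byte loads. */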
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"
        "pxor %%xmm7,%%xmm7\n"
        "1:\n"
        "movdqu (%0),%%xmm1\n"
        "movdqu (%1),%%xmm2\n"
        "movdqu (%0,%4),%%xmm3\n"
        "movdqu (%1,%4),%%xmm4\n"

        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n"
        "punpcklbw %%xmm0,%%xmm3\n"

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"
        "lea (%1,%4,2), %1\n"

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));
    return tmp;
}

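/* High-frequency noise measure for an 8-pixel-wide block: sum of absolute
 * differences between the horizontal gradients of adjacent rows. */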
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

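/* Same measure for a 16-pixel-wide block: left half handled here, right half delegated to hf_noise8_mmx(). */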
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

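/* Noise-preserving SSE comparators: plain SSE plus the weighted absolute
 * difference of the two blocks' high-frequency noise measures. */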
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

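/* Intra vertical SAD: sum of absolute differences between each row and the row
 * above it, for one 16-pixel-wide block. */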
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "add %2,%0\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

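/* MMX2 version of vsad_intra16 using psadbw. */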
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "add %2,%0\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

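/* Inter vertical SAD: vertical activity of the per-pixel difference pix1 - pix2
 * (the differences are biased by 0x80 so the byte-wise SAD trick stays unsigned). */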
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0),%%mm2\n"\
    "movq (%1)," #out0 "\n"\
    "movq 8(%0),%%mm3\n"\
    "movq 8(%1)," #out1 "\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb " #out0 ", %%mm2\n"\
    "psubb " #out1 ", %%mm3\n"\
    "pxor %%mm7, %%mm2\n"\
    "pxor %%mm7, %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

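/* MMX2 version of vsad16 using psadbw. */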
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0)," #out0 "\n"\
    "movq (%1),%%mm2\n"\
    "movq 8(%0)," #out1 "\n"\
    "movq 8(%1),%%mm3\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb %%mm2, " #out0 "\n"\
    "psubb %%mm3, " #out1 "\n"\
    "pxor %%mm7, " #out0 "\n"\
    "pxor %%mm7, " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

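/* dst[i] = src1[i] - src2[i]; 16 bytes per iteration, with a scalar loop for the tail. */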
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    if(w>=16)
    __asm__ volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

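/* HuffYUV median prediction, subtract mode: dst[i] = src2[i] - median(left, top, left + top - topleft),
 * where src1 is the row above; *left and *left_top carry the running state between calls. */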
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1: \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "movq (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t" // median
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

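/* DIFF_PIXELS_1 computes p1 - p2 widened to 16-bit words (the interleave trick avoids
 * a separate zero register); the 4x8/8x8 variants load a whole 8-row block of
 * differences as input to the Hadamard transform below. */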
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a" \n\t"\
    "mov"#m" "#p2", "#t" \n\t"\
    "punpcklbw "#a", "#t" \n\t"\
    "punpcklbw "#a", "#a" \n\t"\
    "psubw "#t", "#a" \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}


#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1) \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2 \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif

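/* HSUM_* reduce the four 16-bit partial sums in one register to a single value in dst,
 * using unsigned saturating adds. */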
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t"\
\
        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0 \n\t"\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z) MMABS_MMX(a,z)
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z) MMABS_MMX2(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z) MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z) MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z) MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z) MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

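/* Sum of squared differences between a block of int8 and a block of int16 values. */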
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"

/*
 * pmulhw:   dst[i] = (src[i] * dst[i]) >> 16
 * pmulhrw:  dst[i] = (src[i] * dst[i] + 0x8000) >> 16
 * pmulhrsw: dst[i] = (src[i] * dst[i] + 0x4000) >> 15
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t"
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

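/* Select the MMX/MMX2/SSE2/SSSE3/3DNow! encoder DSP functions according to the runtime CPU flags. */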
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (CONFIG_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}