00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/avcodec.h"
00028 #include "libavcodec/dsputil.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "dsputil_mmx.h"
00031
00032 extern uint16_t inv_zigzag_direct16[64];
00033
00034
00035 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00036 DCTELEM *block, int n, int qscale)
00037 {
00038 x86_reg level, qmul, qadd, nCoeffs;
00039
00040 qmul = qscale << 1;
00041
00042 assert(s->block_last_index[n]>=0 || s->h263_aic);
00043
00044 if (!s->h263_aic) {
00045 if (n < 4)
00046 level = block[0] * s->y_dc_scale;
00047 else
00048 level = block[0] * s->c_dc_scale;
00049 qadd = (qscale - 1) | 1;
00050 }else{
00051 qadd = 0;
00052 level= block[0];
00053 }
00054 if(s->ac_pred)
00055 nCoeffs=63;
00056 else
00057 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00058
00059 __asm__ volatile(
00060 "movd %1, %%mm6 \n\t"
00061 "packssdw %%mm6, %%mm6 \n\t"
00062 "packssdw %%mm6, %%mm6 \n\t"
00063 "movd %2, %%mm5 \n\t"
00064 "pxor %%mm7, %%mm7 \n\t"
00065 "packssdw %%mm5, %%mm5 \n\t"
00066 "packssdw %%mm5, %%mm5 \n\t"
00067 "psubw %%mm5, %%mm7 \n\t"
00068 "pxor %%mm4, %%mm4 \n\t"
00069 ".p2align 4 \n\t"
00070 "1: \n\t"
00071 "movq (%0, %3), %%mm0 \n\t"
00072 "movq 8(%0, %3), %%mm1 \n\t"
00073
00074 "pmullw %%mm6, %%mm0 \n\t"
00075 "pmullw %%mm6, %%mm1 \n\t"
00076
00077 "movq (%0, %3), %%mm2 \n\t"
00078 "movq 8(%0, %3), %%mm3 \n\t"
00079
00080 "pcmpgtw %%mm4, %%mm2 \n\t"
00081 "pcmpgtw %%mm4, %%mm3 \n\t"
00082
00083 "pxor %%mm2, %%mm0 \n\t"
00084 "pxor %%mm3, %%mm1 \n\t"
00085
00086 "paddw %%mm7, %%mm0 \n\t"
00087 "paddw %%mm7, %%mm1 \n\t"
00088
00089 "pxor %%mm0, %%mm2 \n\t"
00090 "pxor %%mm1, %%mm3 \n\t"
00091
00092 "pcmpeqw %%mm7, %%mm0 \n\t"
00093 "pcmpeqw %%mm7, %%mm1 \n\t"
00094
00095 "pandn %%mm2, %%mm0 \n\t"
00096 "pandn %%mm3, %%mm1 \n\t"
00097
00098 "movq %%mm0, (%0, %3) \n\t"
00099 "movq %%mm1, 8(%0, %3) \n\t"
00100
00101 "add $16, %3 \n\t"
00102 "jng 1b \n\t"
00103 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00104 : "memory"
00105 );
00106 block[0]= level;
00107 }
00108
00109
00110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00111 DCTELEM *block, int n, int qscale)
00112 {
00113 x86_reg qmul, qadd, nCoeffs;
00114
00115 qmul = qscale << 1;
00116 qadd = (qscale - 1) | 1;
00117
00118 assert(s->block_last_index[n]>=0 || s->h263_aic);
00119
00120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00121
00122 __asm__ volatile(
00123 "movd %1, %%mm6 \n\t"
00124 "packssdw %%mm6, %%mm6 \n\t"
00125 "packssdw %%mm6, %%mm6 \n\t"
00126 "movd %2, %%mm5 \n\t"
00127 "pxor %%mm7, %%mm7 \n\t"
00128 "packssdw %%mm5, %%mm5 \n\t"
00129 "packssdw %%mm5, %%mm5 \n\t"
00130 "psubw %%mm5, %%mm7 \n\t"
00131 "pxor %%mm4, %%mm4 \n\t"
00132 ".p2align 4 \n\t"
00133 "1: \n\t"
00134 "movq (%0, %3), %%mm0 \n\t"
00135 "movq 8(%0, %3), %%mm1 \n\t"
00136
00137 "pmullw %%mm6, %%mm0 \n\t"
00138 "pmullw %%mm6, %%mm1 \n\t"
00139
00140 "movq (%0, %3), %%mm2 \n\t"
00141 "movq 8(%0, %3), %%mm3 \n\t"
00142
00143 "pcmpgtw %%mm4, %%mm2 \n\t"
00144 "pcmpgtw %%mm4, %%mm3 \n\t"
00145
00146 "pxor %%mm2, %%mm0 \n\t"
00147 "pxor %%mm3, %%mm1 \n\t"
00148
00149 "paddw %%mm7, %%mm0 \n\t"
00150 "paddw %%mm7, %%mm1 \n\t"
00151
00152 "pxor %%mm0, %%mm2 \n\t"
00153 "pxor %%mm1, %%mm3 \n\t"
00154
00155 "pcmpeqw %%mm7, %%mm0 \n\t"
00156 "pcmpeqw %%mm7, %%mm1 \n\t"
00157
00158 "pandn %%mm2, %%mm0 \n\t"
00159 "pandn %%mm3, %%mm1 \n\t"
00160
00161 "movq %%mm0, (%0, %3) \n\t"
00162 "movq %%mm1, 8(%0, %3) \n\t"
00163
00164 "add $16, %3 \n\t"
00165 "jng 1b \n\t"
00166 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00167 : "memory"
00168 );
00169 }
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00202 DCTELEM *block, int n, int qscale)
00203 {
00204 x86_reg nCoeffs;
00205 const uint16_t *quant_matrix;
00206 int block0;
00207
00208 assert(s->block_last_index[n]>=0);
00209
00210 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00211
00212 if (n < 4)
00213 block0 = block[0] * s->y_dc_scale;
00214 else
00215 block0 = block[0] * s->c_dc_scale;
00216
00217 quant_matrix = s->intra_matrix;
00218 __asm__ volatile(
00219 "pcmpeqw %%mm7, %%mm7 \n\t"
00220 "psrlw $15, %%mm7 \n\t"
00221 "movd %2, %%mm6 \n\t"
00222 "packssdw %%mm6, %%mm6 \n\t"
00223 "packssdw %%mm6, %%mm6 \n\t"
00224 "mov %3, %%"REG_a" \n\t"
00225 ".p2align 4 \n\t"
00226 "1: \n\t"
00227 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00229 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00231 "pmullw %%mm6, %%mm4 \n\t"
00232 "pmullw %%mm6, %%mm5 \n\t"
00233 "pxor %%mm2, %%mm2 \n\t"
00234 "pxor %%mm3, %%mm3 \n\t"
00235 "pcmpgtw %%mm0, %%mm2 \n\t"
00236 "pcmpgtw %%mm1, %%mm3 \n\t"
00237 "pxor %%mm2, %%mm0 \n\t"
00238 "pxor %%mm3, %%mm1 \n\t"
00239 "psubw %%mm2, %%mm0 \n\t"
00240 "psubw %%mm3, %%mm1 \n\t"
00241 "pmullw %%mm4, %%mm0 \n\t"
00242 "pmullw %%mm5, %%mm1 \n\t"
00243 "pxor %%mm4, %%mm4 \n\t"
00244 "pxor %%mm5, %%mm5 \n\t"
00245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00247 "psraw $3, %%mm0 \n\t"
00248 "psraw $3, %%mm1 \n\t"
00249 "psubw %%mm7, %%mm0 \n\t"
00250 "psubw %%mm7, %%mm1 \n\t"
00251 "por %%mm7, %%mm0 \n\t"
00252 "por %%mm7, %%mm1 \n\t"
00253 "pxor %%mm2, %%mm0 \n\t"
00254 "pxor %%mm3, %%mm1 \n\t"
00255 "psubw %%mm2, %%mm0 \n\t"
00256 "psubw %%mm3, %%mm1 \n\t"
00257 "pandn %%mm0, %%mm4 \n\t"
00258 "pandn %%mm1, %%mm5 \n\t"
00259 "movq %%mm4, (%0, %%"REG_a") \n\t"
00260 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00261
00262 "add $16, %%"REG_a" \n\t"
00263 "js 1b \n\t"
00264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00265 : "%"REG_a, "memory"
00266 );
00267 block[0]= block0;
00268 }
00269
00270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00271 DCTELEM *block, int n, int qscale)
00272 {
00273 x86_reg nCoeffs;
00274 const uint16_t *quant_matrix;
00275
00276 assert(s->block_last_index[n]>=0);
00277
00278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00279
00280 quant_matrix = s->inter_matrix;
00281 __asm__ volatile(
00282 "pcmpeqw %%mm7, %%mm7 \n\t"
00283 "psrlw $15, %%mm7 \n\t"
00284 "movd %2, %%mm6 \n\t"
00285 "packssdw %%mm6, %%mm6 \n\t"
00286 "packssdw %%mm6, %%mm6 \n\t"
00287 "mov %3, %%"REG_a" \n\t"
00288 ".p2align 4 \n\t"
00289 "1: \n\t"
00290 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00292 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00294 "pmullw %%mm6, %%mm4 \n\t"
00295 "pmullw %%mm6, %%mm5 \n\t"
00296 "pxor %%mm2, %%mm2 \n\t"
00297 "pxor %%mm3, %%mm3 \n\t"
00298 "pcmpgtw %%mm0, %%mm2 \n\t"
00299 "pcmpgtw %%mm1, %%mm3 \n\t"
00300 "pxor %%mm2, %%mm0 \n\t"
00301 "pxor %%mm3, %%mm1 \n\t"
00302 "psubw %%mm2, %%mm0 \n\t"
00303 "psubw %%mm3, %%mm1 \n\t"
00304 "paddw %%mm0, %%mm0 \n\t"
00305 "paddw %%mm1, %%mm1 \n\t"
00306 "paddw %%mm7, %%mm0 \n\t"
00307 "paddw %%mm7, %%mm1 \n\t"
00308 "pmullw %%mm4, %%mm0 \n\t"
00309 "pmullw %%mm5, %%mm1 \n\t"
00310 "pxor %%mm4, %%mm4 \n\t"
00311 "pxor %%mm5, %%mm5 \n\t"
00312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00314 "psraw $4, %%mm0 \n\t"
00315 "psraw $4, %%mm1 \n\t"
00316 "psubw %%mm7, %%mm0 \n\t"
00317 "psubw %%mm7, %%mm1 \n\t"
00318 "por %%mm7, %%mm0 \n\t"
00319 "por %%mm7, %%mm1 \n\t"
00320 "pxor %%mm2, %%mm0 \n\t"
00321 "pxor %%mm3, %%mm1 \n\t"
00322 "psubw %%mm2, %%mm0 \n\t"
00323 "psubw %%mm3, %%mm1 \n\t"
00324 "pandn %%mm0, %%mm4 \n\t"
00325 "pandn %%mm1, %%mm5 \n\t"
00326 "movq %%mm4, (%0, %%"REG_a") \n\t"
00327 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00328
00329 "add $16, %%"REG_a" \n\t"
00330 "js 1b \n\t"
00331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00332 : "%"REG_a, "memory"
00333 );
00334 }
00335
00336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00337 DCTELEM *block, int n, int qscale)
00338 {
00339 x86_reg nCoeffs;
00340 const uint16_t *quant_matrix;
00341 int block0;
00342
00343 assert(s->block_last_index[n]>=0);
00344
00345 if(s->alternate_scan) nCoeffs= 63;
00346 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00347
00348 if (n < 4)
00349 block0 = block[0] * s->y_dc_scale;
00350 else
00351 block0 = block[0] * s->c_dc_scale;
00352 quant_matrix = s->intra_matrix;
00353 __asm__ volatile(
00354 "pcmpeqw %%mm7, %%mm7 \n\t"
00355 "psrlw $15, %%mm7 \n\t"
00356 "movd %2, %%mm6 \n\t"
00357 "packssdw %%mm6, %%mm6 \n\t"
00358 "packssdw %%mm6, %%mm6 \n\t"
00359 "mov %3, %%"REG_a" \n\t"
00360 ".p2align 4 \n\t"
00361 "1: \n\t"
00362 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00364 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00366 "pmullw %%mm6, %%mm4 \n\t"
00367 "pmullw %%mm6, %%mm5 \n\t"
00368 "pxor %%mm2, %%mm2 \n\t"
00369 "pxor %%mm3, %%mm3 \n\t"
00370 "pcmpgtw %%mm0, %%mm2 \n\t"
00371 "pcmpgtw %%mm1, %%mm3 \n\t"
00372 "pxor %%mm2, %%mm0 \n\t"
00373 "pxor %%mm3, %%mm1 \n\t"
00374 "psubw %%mm2, %%mm0 \n\t"
00375 "psubw %%mm3, %%mm1 \n\t"
00376 "pmullw %%mm4, %%mm0 \n\t"
00377 "pmullw %%mm5, %%mm1 \n\t"
00378 "pxor %%mm4, %%mm4 \n\t"
00379 "pxor %%mm5, %%mm5 \n\t"
00380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00382 "psraw $3, %%mm0 \n\t"
00383 "psraw $3, %%mm1 \n\t"
00384 "pxor %%mm2, %%mm0 \n\t"
00385 "pxor %%mm3, %%mm1 \n\t"
00386 "psubw %%mm2, %%mm0 \n\t"
00387 "psubw %%mm3, %%mm1 \n\t"
00388 "pandn %%mm0, %%mm4 \n\t"
00389 "pandn %%mm1, %%mm5 \n\t"
00390 "movq %%mm4, (%0, %%"REG_a") \n\t"
00391 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00392
00393 "add $16, %%"REG_a" \n\t"
00394 "jng 1b \n\t"
00395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00396 : "%"REG_a, "memory"
00397 );
00398 block[0]= block0;
00399
00400 }
00401
00402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00403 DCTELEM *block, int n, int qscale)
00404 {
00405 x86_reg nCoeffs;
00406 const uint16_t *quant_matrix;
00407
00408 assert(s->block_last_index[n]>=0);
00409
00410 if(s->alternate_scan) nCoeffs= 63;
00411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00412
00413 quant_matrix = s->inter_matrix;
00414 __asm__ volatile(
00415 "pcmpeqw %%mm7, %%mm7 \n\t"
00416 "psrlq $48, %%mm7 \n\t"
00417 "movd %2, %%mm6 \n\t"
00418 "packssdw %%mm6, %%mm6 \n\t"
00419 "packssdw %%mm6, %%mm6 \n\t"
00420 "mov %3, %%"REG_a" \n\t"
00421 ".p2align 4 \n\t"
00422 "1: \n\t"
00423 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00425 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00427 "pmullw %%mm6, %%mm4 \n\t"
00428 "pmullw %%mm6, %%mm5 \n\t"
00429 "pxor %%mm2, %%mm2 \n\t"
00430 "pxor %%mm3, %%mm3 \n\t"
00431 "pcmpgtw %%mm0, %%mm2 \n\t"
00432 "pcmpgtw %%mm1, %%mm3 \n\t"
00433 "pxor %%mm2, %%mm0 \n\t"
00434 "pxor %%mm3, %%mm1 \n\t"
00435 "psubw %%mm2, %%mm0 \n\t"
00436 "psubw %%mm3, %%mm1 \n\t"
00437 "paddw %%mm0, %%mm0 \n\t"
00438 "paddw %%mm1, %%mm1 \n\t"
00439 "pmullw %%mm4, %%mm0 \n\t"
00440 "pmullw %%mm5, %%mm1 \n\t"
00441 "paddw %%mm4, %%mm0 \n\t"
00442 "paddw %%mm5, %%mm1 \n\t"
00443 "pxor %%mm4, %%mm4 \n\t"
00444 "pxor %%mm5, %%mm5 \n\t"
00445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00447 "psrlw $4, %%mm0 \n\t"
00448 "psrlw $4, %%mm1 \n\t"
00449 "pxor %%mm2, %%mm0 \n\t"
00450 "pxor %%mm3, %%mm1 \n\t"
00451 "psubw %%mm2, %%mm0 \n\t"
00452 "psubw %%mm3, %%mm1 \n\t"
00453 "pandn %%mm0, %%mm4 \n\t"
00454 "pandn %%mm1, %%mm5 \n\t"
00455 "pxor %%mm4, %%mm7 \n\t"
00456 "pxor %%mm5, %%mm7 \n\t"
00457 "movq %%mm4, (%0, %%"REG_a") \n\t"
00458 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00459
00460 "add $16, %%"REG_a" \n\t"
00461 "jng 1b \n\t"
00462 "movd 124(%0, %3), %%mm0 \n\t"
00463 "movq %%mm7, %%mm6 \n\t"
00464 "psrlq $32, %%mm7 \n\t"
00465 "pxor %%mm6, %%mm7 \n\t"
00466 "movq %%mm7, %%mm6 \n\t"
00467 "psrlq $16, %%mm7 \n\t"
00468 "pxor %%mm6, %%mm7 \n\t"
00469 "pslld $31, %%mm7 \n\t"
00470 "psrlq $15, %%mm7 \n\t"
00471 "pxor %%mm7, %%mm0 \n\t"
00472 "movd %%mm0, 124(%0, %3) \n\t"
00473
00474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
00475 : "%"REG_a, "memory"
00476 );
00477 }
00478
00479 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00480 const int intra= s->mb_intra;
00481 int *sum= s->dct_error_sum[intra];
00482 uint16_t *offset= s->dct_offset[intra];
00483
00484 s->dct_count[intra]++;
00485
00486 __asm__ volatile(
00487 "pxor %%mm7, %%mm7 \n\t"
00488 "1: \n\t"
00489 "pxor %%mm0, %%mm0 \n\t"
00490 "pxor %%mm1, %%mm1 \n\t"
00491 "movq (%0), %%mm2 \n\t"
00492 "movq 8(%0), %%mm3 \n\t"
00493 "pcmpgtw %%mm2, %%mm0 \n\t"
00494 "pcmpgtw %%mm3, %%mm1 \n\t"
00495 "pxor %%mm0, %%mm2 \n\t"
00496 "pxor %%mm1, %%mm3 \n\t"
00497 "psubw %%mm0, %%mm2 \n\t"
00498 "psubw %%mm1, %%mm3 \n\t"
00499 "movq %%mm2, %%mm4 \n\t"
00500 "movq %%mm3, %%mm5 \n\t"
00501 "psubusw (%2), %%mm2 \n\t"
00502 "psubusw 8(%2), %%mm3 \n\t"
00503 "pxor %%mm0, %%mm2 \n\t"
00504 "pxor %%mm1, %%mm3 \n\t"
00505 "psubw %%mm0, %%mm2 \n\t"
00506 "psubw %%mm1, %%mm3 \n\t"
00507 "movq %%mm2, (%0) \n\t"
00508 "movq %%mm3, 8(%0) \n\t"
00509 "movq %%mm4, %%mm2 \n\t"
00510 "movq %%mm5, %%mm3 \n\t"
00511 "punpcklwd %%mm7, %%mm4 \n\t"
00512 "punpckhwd %%mm7, %%mm2 \n\t"
00513 "punpcklwd %%mm7, %%mm5 \n\t"
00514 "punpckhwd %%mm7, %%mm3 \n\t"
00515 "paddd (%1), %%mm4 \n\t"
00516 "paddd 8(%1), %%mm2 \n\t"
00517 "paddd 16(%1), %%mm5 \n\t"
00518 "paddd 24(%1), %%mm3 \n\t"
00519 "movq %%mm4, (%1) \n\t"
00520 "movq %%mm2, 8(%1) \n\t"
00521 "movq %%mm5, 16(%1) \n\t"
00522 "movq %%mm3, 24(%1) \n\t"
00523 "add $16, %0 \n\t"
00524 "add $32, %1 \n\t"
00525 "add $16, %2 \n\t"
00526 "cmp %3, %0 \n\t"
00527 " jb 1b \n\t"
00528 : "+r" (block), "+r" (sum), "+r" (offset)
00529 : "r"(block+64)
00530 );
00531 }
00532
00533 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00534 const int intra= s->mb_intra;
00535 int *sum= s->dct_error_sum[intra];
00536 uint16_t *offset= s->dct_offset[intra];
00537
00538 s->dct_count[intra]++;
00539
00540 __asm__ volatile(
00541 "pxor %%xmm7, %%xmm7 \n\t"
00542 "1: \n\t"
00543 "pxor %%xmm0, %%xmm0 \n\t"
00544 "pxor %%xmm1, %%xmm1 \n\t"
00545 "movdqa (%0), %%xmm2 \n\t"
00546 "movdqa 16(%0), %%xmm3 \n\t"
00547 "pcmpgtw %%xmm2, %%xmm0 \n\t"
00548 "pcmpgtw %%xmm3, %%xmm1 \n\t"
00549 "pxor %%xmm0, %%xmm2 \n\t"
00550 "pxor %%xmm1, %%xmm3 \n\t"
00551 "psubw %%xmm0, %%xmm2 \n\t"
00552 "psubw %%xmm1, %%xmm3 \n\t"
00553 "movdqa %%xmm2, %%xmm4 \n\t"
00554 "movdqa %%xmm3, %%xmm5 \n\t"
00555 "psubusw (%2), %%xmm2 \n\t"
00556 "psubusw 16(%2), %%xmm3 \n\t"
00557 "pxor %%xmm0, %%xmm2 \n\t"
00558 "pxor %%xmm1, %%xmm3 \n\t"
00559 "psubw %%xmm0, %%xmm2 \n\t"
00560 "psubw %%xmm1, %%xmm3 \n\t"
00561 "movdqa %%xmm2, (%0) \n\t"
00562 "movdqa %%xmm3, 16(%0) \n\t"
00563 "movdqa %%xmm4, %%xmm6 \n\t"
00564 "movdqa %%xmm5, %%xmm0 \n\t"
00565 "punpcklwd %%xmm7, %%xmm4 \n\t"
00566 "punpckhwd %%xmm7, %%xmm6 \n\t"
00567 "punpcklwd %%xmm7, %%xmm5 \n\t"
00568 "punpckhwd %%xmm7, %%xmm0 \n\t"
00569 "paddd (%1), %%xmm4 \n\t"
00570 "paddd 16(%1), %%xmm6 \n\t"
00571 "paddd 32(%1), %%xmm5 \n\t"
00572 "paddd 48(%1), %%xmm0 \n\t"
00573 "movdqa %%xmm4, (%1) \n\t"
00574 "movdqa %%xmm6, 16(%1) \n\t"
00575 "movdqa %%xmm5, 32(%1) \n\t"
00576 "movdqa %%xmm0, 48(%1) \n\t"
00577 "add $32, %0 \n\t"
00578 "add $64, %1 \n\t"
00579 "add $32, %2 \n\t"
00580 "cmp %3, %0 \n\t"
00581 " jb 1b \n\t"
00582 : "+r" (block), "+r" (sum), "+r" (offset)
00583 : "r"(block+64)
00584 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
00585 "%xmm4", "%xmm5", "%xmm6", "%xmm7")
00586 );
00587 }
00588
00589 #if HAVE_SSSE3
00590 #define HAVE_SSSE3_BAK
00591 #endif
00592 #undef HAVE_SSSE3
00593 #define HAVE_SSSE3 0
00594
00595 #undef HAVE_SSE2
00596 #undef HAVE_MMX2
00597 #define HAVE_SSE2 0
00598 #define HAVE_MMX2 0
00599 #define RENAME(a) a ## _MMX
00600 #define RENAMEl(a) a ## _mmx
00601 #include "mpegvideo_mmx_template.c"
00602
00603 #undef HAVE_MMX2
00604 #define HAVE_MMX2 1
00605 #undef RENAME
00606 #undef RENAMEl
00607 #define RENAME(a) a ## _MMX2
00608 #define RENAMEl(a) a ## _mmx2
00609 #include "mpegvideo_mmx_template.c"
00610
00611 #undef HAVE_SSE2
00612 #define HAVE_SSE2 1
00613 #undef RENAME
00614 #undef RENAMEl
00615 #define RENAME(a) a ## _SSE2
00616 #define RENAMEl(a) a ## _sse2
00617 #include "mpegvideo_mmx_template.c"
00618
00619 #ifdef HAVE_SSSE3_BAK
00620 #undef HAVE_SSSE3
00621 #define HAVE_SSSE3 1
00622 #undef RENAME
00623 #undef RENAMEl
00624 #define RENAME(a) a ## _SSSE3
00625 #define RENAMEl(a) a ## _sse2
00626 #include "mpegvideo_mmx_template.c"
00627 #endif
00628
00629 void MPV_common_init_mmx(MpegEncContext *s)
00630 {
00631 int mm_flags = av_get_cpu_flags();
00632
00633 if (mm_flags & AV_CPU_FLAG_MMX) {
00634 const int dct_algo = s->avctx->dct_algo;
00635
00636 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00637 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00638 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00639 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00640 if(!(s->flags & CODEC_FLAG_BITEXACT))
00641 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00642 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00643
00644 if (mm_flags & AV_CPU_FLAG_SSE2) {
00645 s->denoise_dct= denoise_dct_sse2;
00646 } else {
00647 s->denoise_dct= denoise_dct_mmx;
00648 }
00649
00650 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00651 #if HAVE_SSSE3
00652 if(mm_flags & AV_CPU_FLAG_SSSE3){
00653 s->dct_quantize= dct_quantize_SSSE3;
00654 } else
00655 #endif
00656 if(mm_flags & AV_CPU_FLAG_SSE2){
00657 s->dct_quantize= dct_quantize_SSE2;
00658 } else if(mm_flags & AV_CPU_FLAG_MMX2){
00659 s->dct_quantize= dct_quantize_MMX2;
00660 } else {
00661 s->dct_quantize= dct_quantize_MMX;
00662 }
00663 }
00664 }
00665 }