27     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
   29     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
 
   31     8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
 
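/* 8-tap filter core: each filtN vector holds one pair of adjacent taps, so two
 * dot-product / dot-product-accumulate chains cover all eight taps; the two
 * partial sums are then merged with a saturating halfword add. */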
   52 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \ 
   53                             filt0, filt1, filt2, filt3)         \ 
   57     tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \ 
   58     tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \ 
   59     tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \ 
   60     tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \ 
   61     tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \ 
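/* Gather the eight input bytes for every output position with four VSHF_B
 * shuffles, run the 8-tap dot product above, then round (shift right by 7 with
 * rounding) and saturate the halfword results. */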
   66 #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \ 
   67                         filt_h0, filt_h1, filt_h2, filt_h3)              \ 
   69     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \ 
   72     VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \ 
   73                vec0_m, vec1_m, vec2_m, vec3_m);                          \ 
   74     hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \ 
   75                                    filt_h0, filt_h1, filt_h2, filt_h3);  \ 
   77     hz_out_m = __msa_srari_h(hz_out_m, 7);                               \ 
   78     hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \ 
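/* 4-pixel-wide horizontal variant: each shuffle pulls pixels from a pair of
 * rows (src0/src1 and src2/src3), so one invocation filters four rows into the
 * two halfword vectors out0 and out1. */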
   83 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \ 
   84                                    mask0, mask1, mask2, mask3,              \ 
   85                                    filt0, filt1, filt2, filt3,              \ 
   88     v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \ 
   89     v8i16 res0_m, res1_m, res2_m, res3_m;                                   \ 
   91     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \ 
   92     DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \ 
   93     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \ 
   94     DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \ 
   95     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \ 
   96     DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \ 
   97     VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \ 
   98     DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \ 
   99     ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \ 
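/* 8-pixel-wide horizontal variant: every row is shuffled against itself, four
 * rows are filtered per invocation, and the partial sums from the four tap
 * pairs are merged into out0..out3 with saturating adds. */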
  102 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \ 
  103                                    mask0, mask1, mask2, mask3,                \ 
  104                                    filt0, filt1, filt2, filt3,                \ 
  105                                    out0, out1, out2, out3)                    \ 
  107     v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \ 
  108     v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \ 
  110     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \ 
  111     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \ 
  112     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \ 
  113                 res0_m, res1_m, res2_m, res3_m);                              \ 
  114     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \ 
  115     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \ 
  116     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \ 
  117                 res4_m, res5_m, res6_m, res7_m);                              \ 
  118     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \ 
  119     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \ 
  120     DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \ 
  121                  res0_m, res1_m, res2_m, res3_m);                             \ 
  122     VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \ 
  123     VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \ 
  124     DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \ 
  125                  res4_m, res5_m, res6_m, res7_m);                             \ 
  126     ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \ 
  127                 res7_m, out0, out1, out2, out3);                              \ 
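/* Pack the even bytes of two halfword vectors, XOR with 128 to undo the
 * signed-byte bias, average with the existing destination pixels and store
 * 16 bytes. */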
  130 #define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \ 
  134     tmp_m = PCKEV_XORI128_UB(in1, in0);               \ 
  135     tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \ 
  136     ST_UB(tmp_m, (pdst));                             \ 
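/* Same store helper without the sign-bias correction: pack even bytes,
 * average with the destination vector and store 16 bytes. */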
  139 #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \ 
  143     tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \ 
  144     tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \ 
  145     ST_UB(tmp_m, (pdst));                                     \ 
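/* Pack four filtered 8-pixel rows into two vectors, average them with the
 * corresponding destination rows (packed from dst0..dst3) and store the
 * result as four 8-byte rows. */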
  148 #define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \ 
  151     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \ 
  152     uint8_t *pdst_m = (uint8_t *) (pdst);                               \ 
  154     PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \ 
  155     PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \ 
  156     AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \ 
  157     ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \ 
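/* 4-wide horizontal 8-tap block: load the filter, splat the four tap pairs,
 * filter four rows with HORIZ_8TAP_4WID_4VECS_FILT, then round, saturate,
 * pack to bytes and store a 4x4 block. */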
  164     v16u8 mask0, mask1, mask2, mask3, out;

  165     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;

  166     v8i16 filt, out0, out1;

  172     filt = LD_SH(filter);
 
  173     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  179     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  182                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  186     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  193     v16i8 filt0, filt1, filt2, filt3;
 
  195     v16u8 mask0, mask1, mask2, mask3, out;

  196     v8i16 filt, out0, out1, out2, out3;

  202     filt = LD_SH(filter);
 
  203     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  209     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  211     src += (4 * src_stride);
 
  213                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  214     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  217                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  221     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  222     dst += (4 * dst_stride);
 
  224     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  231     v16u8 mask0, mask1, mask2, mask3, out;

  232     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;

  233     v8i16 filt, out0, out1, out2, out3;

  239     filt = LD_SH(filter);
 
  240     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  246     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  248     src += (4 * src_stride);
 
  250                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  251     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  253     src += (4 * src_stride);
 
  255                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  259     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  260     dst += (4 * dst_stride);
 
  262     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  263     dst += (4 * dst_stride);
 
  265     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  267     src += (4 * src_stride);
 
  269                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  270     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  272     src += (4 * src_stride);
 
  274                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  279     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  280     dst += (4 * dst_stride);
 
  282     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  291     } else if (8 == height) {

  293     } else if (16 == height) {
 
  302     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
  303     v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
 
  304     v8i16 filt, out0, out1, out2, out3;

  310     filt = LD_SH(filter);
 
  311     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  317     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  320                                mask3, filt0, filt1, filt2, filt3, out0, out1,
 
  326     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
  334     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
  335     v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
 
  336     v8i16 filt, out0, out1, out2, out3;

  342     filt = LD_SH(filter);
 
  343     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  349     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  350         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  352         src += (4 * src_stride);
 
  354                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  360         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
  361         dst += (4 * dst_stride);
 
  382     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;

  383     v16u8 mask0, mask1, mask2, mask3, out;

  384     v8i16 filt, out0, out1, out2, out3;

  390     filt = LD_SH(filter);
 
  391     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  397     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  398         LD_SB2(src, src_stride, src0, src2);
 
  399         LD_SB2(src + 8, src_stride, src1, src3);
 
  401         src += (2 * src_stride);
 
  403                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  421     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;

  422     v16u8 mask0, mask1, mask2, mask3, out;

  423     v8i16 filt, out0, out1, out2, out3;

  429     filt = LD_SH(filter);
 
  430     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  436     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  438         src2 = LD_SB(src + 16);

  439         src3 = LD_SB(src + 24);
 
  440         src1 = __msa_sldi_b(src2, src0, 8);
 
  444                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  450         src2 = LD_SB(src + 16);

  451         src3 = LD_SB(src + 24);
 
  452         src1 = __msa_sldi_b(src2, src0, 8);
 
  458         ST_UB(out, dst + 16);
 
  463                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  470         ST_UB(out, dst + 16);
 
  480     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;

  481     v16u8 mask0, mask1, mask2, mask3, out;

  482     v8i16 filt, out0, out1, out2, out3;

  488     filt = LD_SH(filter);
 
  489     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  495     for (loop_cnt = height; loop_cnt--;) {
 
  497         src2 = LD_SB(src + 16);

  498         src3 = LD_SB(src + 24);
 
  499         src1 = __msa_sldi_b(src2, src0, 8);
 
  503                                    mask2, mask3, filt0, filt1, filt2, filt3,
 
  504                                    out0, out1, out2, out3);
 
  510         ST_UB(out, dst + 16);
 
  512         src0 = LD_SB(src + 32);

  513         src2 = LD_SB(src + 48);

  514         src3 = LD_SB(src + 56);
 
  515         src1 = __msa_sldi_b(src2, src0, 8);
 
  520                                    mask2, mask3, filt0, filt1, filt2, filt3,
 
  521                                    out0, out1, out2, out3);
 
  525         ST_UB(out, dst + 32);
 
  527         ST_UB(out, dst + 48);
 
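/* Vertical 8-tap path: seven leading rows are loaded and interleaved with
 * ILVR/ILVL so each column's taps sit next to each other in one vector; the
 * loop then brings in four new rows per iteration and reuses
 * FILT_8TAP_DPADD_S_H down the columns. */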
  537     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  538     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
  539     v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
 
  540     v16i8 src10998, filt0, filt1, filt2, filt3;
 
  542     v8i16 filt, out10, out32;
 
  544     src -= (3 * src_stride);
 
  546     filt = LD_SH(filter);
 
  547     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  549     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  550     src += (7 * src_stride);
 
  552     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
  554     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
  555     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
 
  559     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  560         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
  561         src += (4 * src_stride);
 
  563         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
  564                    src87_r, src98_r, src109_r);
 
  565         ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
 
  568                                     filt1, filt2, filt3);
 
  570                                     filt1, filt2, filt3);
 
  574         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  575         dst += (4 * dst_stride);
 
  589     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  590     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
  591     v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
 
  593     v8i16 filt, out0_r, out1_r, out2_r, out3_r;
 
  595     src -= (3 * src_stride);
 
  597     filt = LD_SH(filter);
 
  598     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  600     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  602     src += (7 * src_stride);
 
  603     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
  605     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
  607     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  608         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
  610         src += (4 * src_stride);
 
  612         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
  613                    src87_r, src98_r, src109_r);
 
  615                                      filt1, filt2, filt3);
 
  617                                      filt1, filt2, filt3);
 
  619                                      filt1, filt2, filt3);
 
  621                                      filt1, filt2, filt3);
 
  623         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
  626         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
  627         dst += (4 * dst_stride);
 
  644     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  645     v16i8 filt0, filt1, filt2, filt3;
 
  646     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
  647     v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
 
  648     v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
 
  649     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  650     v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
  652     src -= (3 * src_stride);
 
  654     filt = LD_SH(filter);
 
  655     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  657     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  659     src += (7 * src_stride);
 
  660     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
  662     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
  663     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
 
  665     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
  667     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  668         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
  670         src += (4 * src_stride);
 
  672         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
  673                    src87_r, src98_r, src109_r);
 
  674         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
 
  675                    src87_l, src98_l, src109_l);
 
  677                                      filt1, filt2, filt3);
 
  679                                      filt1, filt2, filt3);
 
  681                                      filt1, filt2, filt3);
 
  683                                      filt1, filt2, filt3);
 
  685                                      filt1, filt2, filt3);
 
  687                                      filt1, filt2, filt3);
 
  689                                      filt1, filt2, filt3);
 
  691                                      filt1, filt2, filt3);
 
  694         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
  695         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
  696         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
  697                     out3_r, tmp0, tmp1, tmp2, tmp3);
 
  699         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 
  700         dst += (4 * dst_stride);
 
  725     uint32_t loop_cnt, cnt;
 
  726     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  727     v16i8 filt0, filt1, filt2, filt3;
 
  728     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
  729     v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
 
  730     v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
 
  731     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  732     v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
  734     src -= (3 * src_stride);
 
  736     filt = LD_SH(filter);
 
  737     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  739     for (cnt = (width >> 4); cnt--;) {
 
  743         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  745         src_tmp += (7 * src_stride);
 
  746         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
 
  747                    src32_r, src54_r, src21_r);
 
  748         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
  749         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
 
  750                    src32_l, src54_l, src21_l);
 
  751         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
  753         for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  754             LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
 
  756             src_tmp += (4 * src_stride);
 
  757             ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
  758                        src87_r, src98_r, src109_r);
 
  759             ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
 
  760                        src87_l, src98_l, src109_l);
 
  762                                          filt0, filt1, filt2, filt3);
 
  764                                          filt0, filt1, filt2, filt3);
 
  766                                          filt0, filt1, filt2, filt3);
 
  768                                          filt0, filt1, filt2, filt3);
 
  770                                          filt0, filt1, filt2, filt3);
 
  772                                          filt0, filt1, filt2, filt3);
 
  774                                          filt0, filt1, filt2, filt3);
 
  776                                          filt0, filt1, filt2, filt3);
 
  779             SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
  780             SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
  781             PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
  782                         out3_r, tmp0, tmp1, tmp2, tmp3);
 
  784             ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
 
  785             dst_tmp += (4 * dst_stride);
 
  825                                      const int8_t *filter_horiz,
 
  826                                      const int8_t *filter_vert,
 
  830     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  831     v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
 
  832     v16u8 mask0, mask1, mask2, mask3, out;
 
  833     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
  834     v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
 
  835     v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
 
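/* Horizontal + vertical 8-tap path: the source pointer is backed up three
 * columns and three rows, each row is first filtered horizontally with
 * HORIZ_8TAP_FILT, and the halfword intermediates are then filtered
 * vertically with the second (filter_vert) tap set. */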
  838     src -= (3 + 3 * src_stride);
 
  841     filt = LD_SH(filter_horiz);
 
  842     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  848     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  850     src += (7 * src_stride);
 
  852     hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
  853                               filt_hz1, filt_hz2, filt_hz3);

  854     hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
  855                               filt_hz1, filt_hz2, filt_hz3);

  856     hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
  857                               filt_hz1, filt_hz2, filt_hz3);

  858     hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
  859                               filt_hz1, filt_hz2, filt_hz3);
 
  860     SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
 
  862     filt = LD_SH(filter_vert);
 
  863     SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
  865     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
 
  866     out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
 
  868     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  869         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
  871         src += (4 * src_stride);
 
  874                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  875         hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
 
  876         out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
 
  881                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  882         hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
 
  883         out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
 
  889         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  890         dst += (4 * dst_stride);
 
  901                                      const int8_t *filter_horiz,
 
  902                                      const int8_t *filter_vert,
 
  906     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  907     v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
 
  908     v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
 
  909     v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
 
  910     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
  911     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
 
  912     v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
 
  915     src -= (3 + 3 * src_stride);
 
  918     filt = LD_SH(filter_horiz);
 
  919     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  925     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  926     src += (7 * src_stride);
 
  929     hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
  930                               filt_hz1, filt_hz2, filt_hz3);

  931     hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
  932                               filt_hz1, filt_hz2, filt_hz3);

  933     hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
  934                               filt_hz1, filt_hz2, filt_hz3);

  935     hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
  936                               filt_hz1, filt_hz2, filt_hz3);

  937     hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
  938                               filt_hz1, filt_hz2, filt_hz3);

  939     hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
  940                               filt_hz1, filt_hz2, filt_hz3);

  941     hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
  942                               filt_hz1, filt_hz2, filt_hz3);
 
  944     filt = LD_SH(filter_vert);
 
  945     SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
  947     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
 
  948     ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
 
  949     ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
 
  951     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  952         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
  953         src += (4 * src_stride);
 
  958                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  959         out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
 
  964                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  965         out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
 
  970                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  971         out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
 
  973                                    filt_vt1, filt_vt2, filt_vt3);
 
  976                                    filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
  977         out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
 
  984         ST8x4_UB(vec0, vec1, dst, dst_stride);
 
  985         dst += (4 * dst_stride);
 
  999                                       const int8_t *filter_horiz,
 
 1000                                       const int8_t *filter_vert,
 
 1005     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 1007                                  filter_vert, height);
 
 1016                                       const int8_t *filter_horiz,
 
 1017                                       const int8_t *filter_vert,
 
 1022     for (multiple8_cnt = 4; multiple8_cnt--;) {
 
 1024                                  filter_vert, height);
 
 1033                                       const int8_t *filter_horiz,
 
 1034                                       const int8_t *filter_vert,
 
 1039     for (multiple8_cnt = 8; multiple8_cnt--;) {
 
 1041                                  filter_vert, height);
 
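/* Averaging variants: in addition to the filtering above, the existing
 * destination rows are loaded (LD_UB*) and the filtered result is averaged
 * with them before being stored back. */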
 1053     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1054     v16u8 dst0, dst1, dst2, dst3, res2, res3;
 
 1055     v16u8 mask0, mask1, mask2, mask3;
 
 1056     v8i16 filt, res0, res1;

 1062     filt = LD_SH(filter);
 
 1063     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1069     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1072                                mask3, filt0, filt1, filt2, filt3, res0, res1);
 
 1073     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1077     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
 
 1080     ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 
 1088     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1089     v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
 
 1090     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 1091     v8i16 filt, vec0, vec1, vec2, vec3;

 1097     filt = LD_SH(filter);
 
 1098     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1104     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1106     src += (4 * src_stride);
 
 1107     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 1109                                mask3, filt0, filt1, filt2, filt3, vec0, vec1);
 
 1110     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1113                                mask3, filt0, filt1, filt2, filt3, vec2, vec3);
 
 1116     PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
 
 1117                 res0, res1, res2, res3);
 
 1118     ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
 
 1120     ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
 
 1121                dst0, dst2, dst4, dst6);
 
 1122     ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
 
 1124     ST4x8_UB(res0, res2, dst, dst_stride);
 
 1136     } else if (8 == height) {
 
 1149     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1150     v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
 
 1151     v8i16 filt, out0, out1, out2, out3;

 1157     filt = LD_SH(filter);
 
 1158     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1164     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1165         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1167         src += (4 * src_stride);
 
 1169                                    mask3, filt0, filt1, filt2, filt3, out0,
 
 1171         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1176         dst += (4 * dst_stride);
 
 1187     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1188     v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
 
 1189     v8i16 filt, out0, out1, out2, out3;
 
 1190     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1191     v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1197     filt = LD_SH(filter);
 
 1198     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1204     for (loop_cnt = height >> 1; loop_cnt--;) {
 
 1205         LD_SB2(src, src_stride, src0, src2);
 
 1206         LD_SB2(src + 8, src_stride, src1, src3);
 
 1207         src += (2 * src_stride);
 
 1210         VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
 
 1212         VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
 
 1214         VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
 
 1216         VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
 
 1218         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 1220         DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
 
 1221                     vec9, vec10, vec11);
 
 1222         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
 
 1224         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
 
 1225                      vec8, vec9, vec10, vec11);
 
 1226         ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
 
 1228         LD_UB2(dst, dst_stride, dst0, dst1);
 
 1245     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1246     v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
 
 1247     v8i16 filt, out0, out1, out2, out3;
 
 1248     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1249     v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1255     filt = LD_SH(filter);
 
 1256     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1262     for (loop_cnt = height; loop_cnt--;) {
 
 1264         src2 = LD_SB(src + 16);

 1265         src3 = LD_SB(src + 24);
 
 1266         src1 = __msa_sldi_b(src2, src0, 8);
 
 1270         VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
 
 1272         VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
 
 1274         VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
 
 1276         VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
 
 1278         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 1280         DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
 
 1281                     vec9, vec10, vec11);
 
 1282         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
 
 1284         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
 
 1285                      vec8, vec9, vec10, vec11);
 
 1286         ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
 
 1290         LD_UB2(dst, 16, dst1, dst2);
 
 1303     uint32_t loop_cnt, cnt;
 
 1304     v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
 
 1305     v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
 
 1306     v8i16 filt, out0, out1, out2, out3;
 
 1307     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1308     v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1314     filt = LD_SH(filter);
 
 1315     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1321     for (loop_cnt = height; loop_cnt--;) {
 
 1322         for (cnt = 0; cnt < 2; ++cnt) {
 
 1323             src0 = LD_SB(&src[cnt << 5]);

 1324             src2 = LD_SB(&src[16 + (cnt << 5)]);

 1325             src3 = LD_SB(&src[24 + (cnt << 5)]);
 
 1326             src1 = __msa_sldi_b(src2, src0, 8);
 
 1329             VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
 
 1331             VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
 
 1333             VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
 
 1335             VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
 
 1337             DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1338                         vec0, vec1, vec2, vec3);
 
 1339             DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
 
 1340                         vec8, vec9, vec10, vec11);
 
 1341             DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
 
 1342                          vec0, vec1, vec2, vec3);
 
 1343             DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
 
 1344                          vec8, vec9, vec10, vec11);
 
 1345             ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
 
 1349             LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
 
 1366     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;

 1367     v16u8 dst0, dst1, dst2, dst3, out;
 
 1368     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1369     v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
 
 1370     v16i8 src10998, filt0, filt1, filt2, filt3;
 
 1371     v8i16 filt, out10, out32;
 
 1373     src -= (3 * src_stride);
 
 1375     filt = LD_SH(filter);
 
 1376     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1378     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1379     src += (7 * src_stride);
 
 1381     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
 1383     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1384     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
 
 1388     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1389         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1390         src += (4 * src_stride);
 
 1392         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1393         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1394                    src87_r, src98_r, src109_r);
 
 1395         ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
 
 1398                                     filt1, filt2, filt3);
 
 1400                                     filt1, filt2, filt3);
 
 1404         ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
 
 1406         dst0 = (v16u8) __msa_ilvr_d((v2i64) dst2, (v2i64) dst0);
 
 1407         out = __msa_aver_u_b(out, dst0);
 
 1409         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1410         dst += (4 * dst_stride);
 
 1426     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1427     v16u8 dst0, dst1, dst2, dst3;
 
 1428     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1429     v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
 
 1430     v8i16 filt, out0, out1, out2, out3;
 
 1432     src -= (3 * src_stride);
 
 1434     filt = LD_SH(filter);
 
 1435     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1437     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1438     src += (7 * src_stride);
 
 1441     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
 1443     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1445     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1446         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1447         src += (4 * src_stride);
 
 1449         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1451         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1452                    src87_r, src98_r, src109_r);
 
 1454                                    filt1, filt2, filt3);
 
 1456                                    filt1, filt2, filt3);
 
 1458                                    filt1, filt2, filt3);
 
 1460                                    filt1, filt2, filt3);
 
 1465         dst += (4 * dst_stride);
 
 1487     uint32_t loop_cnt, cnt;
 
 1488     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1489     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1490     v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
 
 1491     v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
 
 1492     v16i8 filt0, filt1, filt2, filt3;
 
 1493     v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
 
 1494     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
 
 1496     src -= (3 * src_stride);
 
 1498     filt = LD_SH(filter);
 
 1499     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1501     for (cnt = (width >> 4); cnt--;) {
 
 1505         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1507         src_tmp += (7 * src_stride);
 
 1509         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
 
 1510                    src32_r, src54_r, src21_r);
 
 1511         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1512         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
 
 1513                    src32_l, src54_l, src21_l);
 
 1514         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1516         for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1517             LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
 
 1518             src_tmp += (4 * src_stride);
 
 1520             LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
 
 1522             ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1523                        src87_r, src98_r, src109_r);
 
 1524             ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
 
 1525                        src87_l, src98_l, src109_l);
 
 1527                                          filt0, filt1, filt2, filt3);
 
 1529                                          filt0, filt1, filt2, filt3);
 
 1531                                          filt0, filt1, filt2, filt3);
 
 1533                                          filt0, filt1, filt2, filt3);
 
 1535                                          filt0, filt1, filt2, filt3);
 
 1537                                          filt0, filt1, filt2, filt3);
 
 1539                                          filt0, filt1, filt2, filt3);
 
 1541                                          filt0, filt1, filt2, filt3);
 
 1544             SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1545             SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 1546             PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 1547                         out3_r, tmp0, tmp1, tmp2, tmp3);
 
 1549             AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
 
 1550                         dst0, dst1, dst2, dst3);
 
 1551             ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
 
 1552             dst_tmp += (4 * dst_stride);
 
 1581                                            filter, height, 16);
 
 1591                                            filter, height, 32);
 
 1601                                            filter, height, 64);
 
 1608                                                   const int8_t *filter_horiz,
 
 1609                                                   const int8_t *filter_vert,
 
 1613     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1614     v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
 
 1615     v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
 
 1616     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1617     v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
 
 1618     v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
 
 1621     src -= (3 + 3 * src_stride);
 
 1624     filt = LD_SH(filter_horiz);
 
 1625     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1631     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1633     src += (7 * src_stride);
 
 1635     hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
 1636                               filt_hz1, filt_hz2, filt_hz3);

 1637     hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
 1638                               filt_hz1, filt_hz2, filt_hz3);

 1639     hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
 1640                               filt_hz1, filt_hz2, filt_hz3);

 1641     hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
 1642                               filt_hz1, filt_hz2, filt_hz3);
 
 1643     SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
 
 1645     filt = LD_SH(filter_vert);
 
 1646     SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
 1648     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 1649     vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
 
 1651     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1652         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1654         src += (4 * src_stride);
 
 1656         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1658                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1659         hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
 
 1660         vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
 
 1662                                    filt_vt2, filt_vt3);
 
 1665                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1666         hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
 
 1667         vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
 
 1669                                    filt_vt2, filt_vt3);
 
 1670         ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
 
 1677         ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
 
 1678         dst += (4 * dst_stride);
 
 1691                                                   const int8_t *filter_horiz,
 
 1692                                                   const int8_t *filter_vert,
 
 1696     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1697     v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
 
 1698     v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
 
 1699     v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
 
 1700     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 1701     v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
 
 1702     v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
 
 1705     src -= (3 + 3 * src_stride);
 
 1708     filt = LD_SH(filter_horiz);
 
 1709     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1715     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1716     src += (7 * src_stride);
 
 1719     hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
 1720                               filt_hz1, filt_hz2, filt_hz3);

 1721     hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
 1722                               filt_hz1, filt_hz2, filt_hz3);

 1723     hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
 1724                               filt_hz1, filt_hz2, filt_hz3);

 1725     hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
 1726                               filt_hz1, filt_hz2, filt_hz3);

 1727     hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
 1728                               filt_hz1, filt_hz2, filt_hz3);

 1729     hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
 1730                               filt_hz1, filt_hz2, filt_hz3);

 1731     hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
 1732                               filt_hz1, filt_hz2, filt_hz3);
 
 1734     filt = LD_SH(filter_vert);
 
 1735     SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
 1737     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
 
 1738     ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
 
 1739     ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
 
 1741     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1742         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1744         src += (4 * src_stride);
 
 1746         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 1749                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1750         out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
 
 1752                                    filt_vt2, filt_vt3);
 
 1755                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1756         out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
 
 1758                                    filt_vt2, filt_vt3);
 
 1761                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1762         out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
 
 1764                                    filt_vt2, filt_vt3);
 
 1767                                    filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
 1768         out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
 
 1770                                    filt_vt2, filt_vt3);
 
 1776         dst += (4 * dst_stride);
 
 1792                                                    const int8_t *filter_horiz,
 
 1793                                                    const int8_t *filter_vert,
 
 1798     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 1800                                               filter_horiz, filter_vert,
 
 1812                                                    const int8_t *filter_horiz,
 
 1813                                                    const int8_t *filter_vert,
 
 1818     for (multiple8_cnt = 4; multiple8_cnt--;) {
 
 1820                                               filter_horiz, filter_vert,
 
 1832                                                    const int8_t *filter_horiz,
 
 1833                                                    const int8_t *filter_vert,
 
 1838     for (multiple8_cnt = 8; multiple8_cnt--;) {
 
 1840                                               filter_horiz, filter_vert,
 
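/* 2-tap (bilinear) horizontal path: a single tap pair is splatted from the
 * filter, pixels are gathered with one shuffle mask, filtered with unsigned
 * byte dot products and then rounded and packed back to bytes. */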
 1853     v16u8 filt0, vec0, vec1, res0, res1;
 
 1854     v8u16 vec2, vec3, filt;
 
 1859     filt = LD_UH(filter);

 1860     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 1862     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1863     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
 
 1864     DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
 
 1867     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 1874     v16u8 vec0, vec1, vec2, vec3, filt0;
 
 1875     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 1876     v16i8 res0, res1, res2, res3;
 
 1877     v8u16 vec4, vec5, vec6, vec7, filt;

 1882     filt = LD_UH(filter);

 1883     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 1885     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 1886     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
 
 1887     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
 
 1888     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1889                 vec4, vec5, vec6, vec7);
 
 1891     PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
 
 1892                 res0, res1, res2, res3);
 
 1893     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 1894     dst += (4 * dst_stride);
 
 1895     ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 
 1900                          int height, int mx, int my)

 1906     } else if (8 == height) {
 
 1917     v8u16 vec0, vec1, vec2, vec3, filt;

 1922     filt = LD_UH(filter);

 1923     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 1925     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1926     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 1927     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 1928     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1929                 vec0, vec1, vec2, vec3);
 
 1932     ST8x4_UB(src0, src1, dst, dst_stride);
 
 1941     v8u16 vec0, vec1, vec2, vec3, filt;

 1946     filt = LD_UH(filter);

 1947     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 1949     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1950     src += (4 * src_stride);
 
 1952     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 1953     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 1954     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1955                 vec0, vec1, vec2, vec3);
 
 1957     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1958     src += (4 * src_stride);
 
 1961     ST8x4_UB(out0, out1, dst, dst_stride);
 
 1962     dst += (4 * dst_stride);
 
 1964     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 1965     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 1966     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1967                 vec0, vec1, vec2, vec3);
 
 1970     ST8x4_UB(out0, out1, dst, dst_stride);
 
 1971     dst += (4 * dst_stride);
 
 1974         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1975         src += (4 * src_stride);
 
 1977         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 1978         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 1979         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1980                     vec0, vec1, vec2, vec3);
 
 1982         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1983         src += (4 * src_stride);
 
 1986         ST8x4_UB(out0, out1, dst, dst_stride);
 
 1988         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 1989         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 1990         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 1991                     vec0, vec1, vec2, vec3);
 
 1994         ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
 
 2000                          int height, int mx, int my)
 
 2014                           int height, int mx, int my)

 2018     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2019     v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2020     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
 
 2024     loop_cnt = (height >> 2) - 1;
 
 2027     filt = LD_UH(filter);

 2028     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2030     LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2031     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 2032     src += (4 * src_stride);
 
 2034     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2035     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2036     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 2037     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 2038     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2039                 out0, out1, out2, out3);
 
 2040     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2041                 out4, out5, out6, out7);
 
 2053     for (; loop_cnt--;) {
 
 2054         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2055         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 2056         src += (4 * src_stride);
 
 2058         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2059         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2060         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 2061         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 2062         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2063                     out0, out1, out2, out3);
 
 2064         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2065                     out4, out5, out6, out7);
 
 2081                           int height, int mx, int my)

 2085     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2086     v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2087     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

 2092     filt = LD_UH(filter);

 2093     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2095     for (loop_cnt = height >> 1; loop_cnt--;) {
 
 2097         src2 = LD_SB(src + 16);

 2098         src3 = LD_SB(src + 24);
 
 2099         src1 = __msa_sldi_b(src2, src0, 8);
 
 2102         src6 = LD_SB(src + 16);

 2103         src7 = LD_SB(src + 24);
 
 2104         src5 = __msa_sldi_b(src6, src4, 8);
 
 2107         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2108         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2109         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 2110         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 2111         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2112                     out0, out1, out2, out3);
 
 2113         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2114                     out4, out5, out6, out7);
 
 2128                           int height, int mx, int my)

 2132     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2133     v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2134     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

 2139     filt = LD_UH(filter);

 2140     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2142     for (loop_cnt = height; loop_cnt--;) {
 
 2144         src2 = LD_SB(src + 16);

 2145         src4 = LD_SB(src + 32);

 2146         src6 = LD_SB(src + 48);

 2147         src7 = LD_SB(src + 56);
 
 2148         SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
 
 2151         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2152         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2153         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 2154         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 2155         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2156                     out0, out1, out2, out3);
 
 2157         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2158                     out4, out5, out6, out7);
 
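/* 2-tap (bilinear) vertical path: successive rows are interleaved with ILVR_B
 * so the two source pixels of every output sit side by side, then filtered
 * with unsigned dot products and packed. */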
 2173     v16i8 src0, src1, src2, src3, src4;
 
 2174     v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
 
 2179     filt = LD_SH(filter);
 
 2180     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2182     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 2183     src += (5 * src_stride);
 
 2185     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 2186                src10_r, src21_r, src32_r, src43_r);
 
 2187     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
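      /* Vertical 2-tap, 4 pixels wide: ILVR_B4 interleaves each row with the row
         below it, and ILVR_D2 packs two of those row pairs per vector, so a single
         DOTP_UB2_UH yields all four 4-pixel output rows. */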
 
 2188     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
 
 2191     src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 2192     ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 
 2199     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 2200     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
 
 2201     v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
 
 2202     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 2206     filt = LD_SH(filter);
 
 2207     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2209     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 2210     src += (8 * src_stride);
 
 2215     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
 
 2217     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 2219     ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
 
 2220                src87_r, src76_r, src2110, src4332, src6554, src8776);
 
 2221     DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
 
 2222                 tmp0, tmp1, tmp2, tmp3);
 
 2225     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
 
 2226     ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 
 2227     ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 
 2232                          int height, int mx, int my)

 2238     } else if (8 == height) {

 2247     v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
 
 2249     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 2253     filt = LD_SH(filter);
 
 2254     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2256     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
 2257     ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
 
 2258     ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
 
 2259     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2260                 tmp0, tmp1, tmp2, tmp3);
 
 2264     ST8x4_UB(out0, out1, dst, dst_stride);
 
 2272     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 2273     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 2275     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 2279     filt = LD_SH(filter);
 
 2280     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2285     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 2286         LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
 
 2287         src += (8 * src_stride);
 
 2289         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 2290                    vec0, vec1, vec2, vec3);
 
 2291         ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
 2292                    vec4, vec5, vec6, vec7);
 
 2293         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2294                     tmp0, tmp1, tmp2, tmp3);
 
 2298         ST8x4_UB(out0, out1, dst, dst_stride);
 
 2299         dst += (4 * dst_stride);
 
 2301         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2302                     tmp0, tmp1, tmp2, tmp3);
 
 2306         ST8x4_UB(out0, out1, dst, dst_stride);
 
 2307         dst += (4 * dst_stride);
 
 2315                          int height, int mx, int my)

 2329                           int height, int mx, int my)

 2333     v16u8 src0, src1, src2, src3, src4;
 
 2334     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 2335     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 2339     filt = LD_SH(filter);
 
 2340     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2345     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2346         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 2347         src += (4 * src_stride);
 
 2349         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 2350         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 2351         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 2357         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 2358         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 2359         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 2365         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 2371         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 2383                           int height, int mx, int my)

 2387     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
 
 2388     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 2389     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 2393     filt = LD_SH(filter);
 
 2394     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2397     src5 = LD_UB(src + 16);
 
 2400     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2401         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 2402         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 2403         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 2405         LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
 
 2406         src += (4 * src_stride);
 
 2408         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 2412         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 2417         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 2418         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 2419         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 2424         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 2429         ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
 
 2430         ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
 
 2431         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 2436         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 2441         ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
 
 2442         ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
 
 2443         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 2446         PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
 
 2448         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 2451         PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
 
 2452         dst += (4 * dst_stride);
 
 2461                           int height, int mx, int my)

 2465     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2466     v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 2467     v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 2471     filt = LD_SH(filter);
 
 2472     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 2474     LD_UB4(src, 16, src0, src3, src6, src9);
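      /* 64-wide vertical path: the row above the block is preloaded as four 16-byte
         columns (src0/src3/src6/src9) so each iteration can interleave the freshly
         loaded rows against it; the elided loop tail presumably rotates these
         "previous row" vectors forward. */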
 
 2477     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 2478         LD_UB2(src, src_stride, src1, src2);
 
 2479         LD_UB2(src + 16, src_stride, src4, src5);
 
 2480         LD_UB2(src + 32, src_stride, src7, src8);
 
 2481         LD_UB2(src + 48, src_stride, src10, src11);
 
 2482         src += (2 * src_stride);
 
 2484         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 2485         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 2486         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 2491         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 2496         ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
 
 2497         ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
 
 2498         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 2503         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 2508         ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
 
 2509         ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
 
 2510         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 2515         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 2520         ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
 
 2521         ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
 
 2522         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 2527         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 2531         dst += (2 * dst_stride);
 
 2542                                const int8_t *filter_horiz, const int8_t *filter_vert)
 
 2545     v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
 
 2546     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

 2551     filt = LD_UH(filter_horiz);

 2552     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

 2554     filt = LD_UH(filter_vert);

 2555     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2557     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 2561     hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
 
 2562     hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
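      /* Horizontal-then-vertical 2-tap, 4x4: the horizontal pass leaves two 4-pixel
         results per hz_out vector; sldi_b/pckod_d realign them so ILVEV_B can pair
         every horizontally filtered row with the next one for the vertical tap. */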
 
 2564     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2565     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 2569     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 2574                                const int8_t *filter_horiz, const int8_t *filter_vert)

 2576     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
 
 2577     v16i8 res0, res1, res2, res3;
 
 2578     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
 
 2579     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 2580     v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

 2585     filt = LD_UH(filter_horiz);

 2586     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

 2588     filt = LD_UH(filter_vert);

 2589     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2591     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 2592     src += (8 * src_stride);
 
 2600     SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
 
 2601                hz_out3, hz_out5, 8);
 
 2602     hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
 
 2604     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2605     ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
 
 2606     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
 
 2607                 vec4, vec5, vec6, vec7);
 
 2610     PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
 
 2611                 res0, res1, res2, res3);
 
 2612     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 2613     dst += (4 * dst_stride);
 
 2614     ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 
 2619                           int height, int mx, int my)
 
 2626                                   filter_horiz, filter_vert);
 
 2627     } else if (8 == height) {
 
 2629                                   filter_horiz, filter_vert);
 
 2635                                const int8_t *filter_horiz, const int8_t *filter_vert)
 
 2638     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
 
 2639     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 2645     filt = LD_SH(filter_horiz);
 
 2646     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 2648     filt = LD_SH(filter_vert);
 
 2649     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 2651     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 2655     vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2656     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 2659     vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2660     tmp1 = __msa_dotp_u_h(vec1, filt_vt);
 
 2663     vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2664     tmp2 = __msa_dotp_u_h(vec2, filt_vt);
 
 2667     vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2668     tmp3 = __msa_dotp_u_h(vec3, filt_vt);
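      /* The 8-wide hv path ping-pongs between hz_out0 and hz_out1: each new
         horizontally filtered row overwrites the older of the two and is interleaved
         with the surviving one, so only two horizontal results stay live. */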
 
 2673     ST8x4_UB(out0, out1, dst, dst_stride);
 
 2678                                    const int8_t *filter_horiz, const int8_t *filter_vert,
 
 2683     v16u8 filt_hz, filt_vt, vec0;
 
 2684     v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 
 2690     filt = LD_SH(filter_horiz);
 
 2691     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 2693     filt = LD_SH(filter_vert);
 
 2694     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 2701     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 2702         LD_SB4(src, src_stride, src1, src2, src3, src4);
 
 2703         src += (4 * src_stride);
 
 2706         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2707         tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
 2710         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2711         tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
 2717         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2718         tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
 2721         LD_SB4(src, src_stride, src1, src2, src3, src4);
 
 2722         src += (4 * src_stride);
 
 2723         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2724         tmp4 = __msa_dotp_u_h(vec0, filt_vt);
 
 2729         ST8x4_UB(out0, out1, dst, dst_stride);
 
 2730         dst += (4 * dst_stride);
 
 2733         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2734         tmp5 = __msa_dotp_u_h(vec0, filt_vt);
 
 2737         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2738         tmp6 = __msa_dotp_u_h(vec0, filt_vt);
 
 2741         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 2742         tmp7 = __msa_dotp_u_h(vec0, filt_vt);
 
 2745         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 2746         tmp8 = __msa_dotp_u_h(vec0, filt_vt);
 
 2751         ST8x4_UB(out0, out1, dst, dst_stride);
 
 2752         dst += (4 * dst_stride);
 
 2758                           int height, int mx, int my)
 
 2765                                   filter_horiz, filter_vert);
 
 2768                                       filter_horiz, filter_vert, height);
 
 2774                            int height, int mx, int my)

 2779     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2780     v16u8 filt_hz, filt_vt, vec0, vec1;
 
 2781     v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
 
 2787     filt = LD_SH(filter_horiz);
 
 2788     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 2790     filt = LD_SH(filter_vert);
 
 2791     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 2793     LD_SB2(src, 8, src0, src1);
 
 2800     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2801         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2802         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 2803         src += (4 * src_stride);
 
 2807         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2808         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2816         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 2817         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2825         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 2826         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2834         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 2835         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
 
 2845                            int height, int mx, int my)
 
 2849     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 2859                            int height, int mx, int my)
 
 2863     for (multiple8_cnt = 4; multiple8_cnt--;) {
 
 2877     v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
 
 2878     v8u16 vec2, vec3, filt;

 2883     filt = LD_UH(filter);

 2884     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2886     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2887     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 2888     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
 
 2889     DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
 
 2892     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
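      /* Averaging variant: the four 4-byte destination rows are word-interleaved so
         the filtered result can be blended with them (presumably via aver_u_b in the
         elided line) before the 4x4 store. */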
 
 2894     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 2902     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 2903     v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
 
 2904     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 2905     v8u16 vec4, vec5, vec6, vec7, filt;

 2910     filt = LD_UH(filter);

 2911     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2913     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 2914     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 2915     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
 
 2916     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
 
 2917     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
 
 2920     PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
 
 2922     ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
 
 2924     AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
 
 2926     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 2927     dst += (4 * dst_stride);
 
 2928     ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 
 2933                          int height, int mx, int my)

 2940     } else if (8 == height) {
 
 2952     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 2953     v8u16 vec0, vec1, vec2, vec3, filt;

 2958     filt = LD_UH(filter);

 2959     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2961     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2962     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2963     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2964     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2965                 vec0, vec1, vec2, vec3);
 
 2967     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 2980     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 2981     v8u16 vec0, vec1, vec2, vec3, filt;

 2986     filt = LD_UH(filter);

 2987     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 2989     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2990     src += (4 * src_stride);
 
 2991     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 2992     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 2993     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
 
 2996     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 2997     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2998     src += (4 * src_stride);
 
 3001     dst += (4 * dst_stride);
 
 3003     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3004     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3005     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
 
 3008     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3011     dst += (4 * dst_stride);
 
 3014         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 3015         src += (4 * src_stride);
 
 3017         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3018         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3019         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 3022         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3023         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 3026         dst += (4 * dst_stride);
 
 3028         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3029         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3030         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
 
 3033         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3041                          int height, int mx, int my)

 3056                           int height, int mx, int my)

 3060     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3061     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3062     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3063     v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

 3068     filt = LD_UH(filter);

 3069     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3071     LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 3072     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 3073     src += (4 * src_stride);
 
 3075     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3076     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3077     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 3078     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 3079     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
 
 3081     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
 
 3085     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3095     for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
 
 3096         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 3097         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 3098         src += (4 * src_stride);
 
 3100         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3101         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3102         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 3103         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 3104         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
 
 3106         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
 
 3110         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3124                           int height, int mx, int my)

 3128     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3129     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3130     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3131     v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

 3136     filt = LD_UH(filter);

 3137     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3139     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 3141         src2 = LD_SB(src + 16);

 3142         src3 = LD_SB(src + 24);
 
 3143         src1 = __msa_sldi_b(src2, src0, 8);
 
 3146         src6 = LD_SB(src + 16);

 3147         src7 = LD_SB(src + 24);
 
 3148         src5 = __msa_sldi_b(src6, src4, 8);
 
 3151         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3152         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3153         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 3154         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 3155         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3156                     res0, res1, res2, res3);
 
 3157         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3158                     res4, res5, res6, res7);
 
 3161         LD_UB2(dst, 16, dst0, dst1);
 
 3165         LD_UB2(dst, 16, dst2, dst3);
 
 3174                           int height, int mx, int my)

 3178     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3179     v16u8 filt0, dst0, dst1, dst2, dst3;
 
 3180     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3181     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

 3186     filt = LD_UH(filter);

 3187     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3189     for (loop_cnt = height; loop_cnt--;) {
 
 3190         LD_SB4(src, 16, src0, src2, src4, src6);
 
 3191         src7 = LD_SB(src + 56);
 
 3192         SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
 
 3195         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
 
 3196         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
 
 3197         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
 
 3198         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
 
 3199         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3200                     out0, out1, out2, out3);
 
 3201         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3202                     out4, out5, out6, out7);
 
 3205         LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
 
 3219     v16i8 src0, src1, src2, src3, src4;

 3220     v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
 
 3221     v16i8 src10_r, src32_r, src21_r, src43_r;
 
 3225     filt = LD_SH(filter);
 
 3226     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3228     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 3229     src += (4 * src_stride);
 
 3234     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3235     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
 
 3236     dst0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
 
 3237     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 3238                src10_r, src21_r, src32_r, src43_r);
 
 3239     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 
 3240     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
 
 3244     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
 3245     out = __msa_aver_u_b(out, dst0);
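      /* Four filtered rows are packed into a single vector and averaged against the
         identically packed destination rows with one aver_u_b; the 4x4 store then
         scatters them back to dst. */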
 
 3247     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 3255     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3256     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
 
 3257     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
 
 3258     v16u8 src2110, src4332, src6554, src8776, filt0;
 
 3259     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3262     filt = LD_SH(filter);
 
 3263     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3265     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 3266     src += (8 * src_stride);
 
 3269     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 3270     ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
 
 3272     ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
 
 3273     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
 
 3275     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
 
 3277     ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
 
 3278                src87_r, src76_r, src2110, src4332, src6554, src8776);
 
 3279     DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
 
 3280                 tmp0, tmp1, tmp2, tmp3);
 
 3283     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
 
 3284     AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
 
 3285     ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 
 3286     dst += (4 * dst_stride);
 
 3287     ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
 
 3292                          int height, int mx, int my)

 3299     } else if (8 == height) {

 3311     v16u8 src0, src1, src2, src3, src4;
 
 3312     v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
 
 3313     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3317     filt = LD_SH(filter);
 
 3318     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3320     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
 
 3321     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3322     ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
 
 3323     ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
 
 3324     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3325                 tmp0, tmp1, tmp2, tmp3);
 
 3340     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 3341     v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 3342     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 3343     v8u16 tmp0, tmp1, tmp2, tmp3;
 
 3347     filt = LD_SH(filter);
 
 3348     filt0 = (v16u8) __msa_splati_h(filt, 0);
 
 3353     for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 3354         LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
 
 3355         src += (8 * src_stride);
 
 3356         LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
 
 3358         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
 
 3359                    vec0, vec1, vec2, vec3);
 
 3360         ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
 3361                    vec4, vec5, vec6, vec7);
 
 3362         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 3363                     tmp0, tmp1, tmp2, tmp3);
 
 3367                            dst4, dst, dst_stride);
 
 3368         dst += (4 * dst_stride);
 
 3370         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 3371                     tmp0, tmp1, tmp2, tmp3);
 
 3375                            dst8, dst, dst_stride);
 
 3376         dst += (4 * dst_stride);
 
 3384                          int height, int mx, int my)

 3399                           int height, int mx, int my)

 3403     v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
 
 3404     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3405     v8u16 tmp0, tmp1, tmp2, tmp3, filt;

 3408     filt = LD_UH(filter);

 3409     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3414     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3415         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 3416         src += (4 * src_stride);
 
 3418         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3419         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 3420         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 3421         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3427         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 3428         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 3429         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3435         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3441         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3453                           int height, int mx, int my)

 3457     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
 
 3458     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3459     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
 
 3460     v8u16 tmp0, tmp1, tmp2, tmp3, filt;

 3463     filt = LD_UH(filter);

 3464     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3466     LD_UB2(src, 16, src0, src5);
 
 3469     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3470         LD_UB4(src, src_stride, src1, src2, src3, src4);
 
 3471         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3472         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 3473         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 3475         LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
 
 3476         LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
 
 3477         src += (4 * src_stride);
 
 3479         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3484         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3489         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
 
 3490         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
 
 3491         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3496         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3501         ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
 
 3502         ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
 
 3503         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3508         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3513         ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
 
 3514         ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
 
 3515         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
 
 3520         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
 
 3524         dst += (4 * dst_stride);
 
 3533                           int height, int mx, int my)

 3537     v16u8 src0, src1, src2, src3, src4, src5;
 
 3538     v16u8 src6, src7, src8, src9, src10, src11, filt0;
 
 3539     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3540     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 3541     v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 3545     filt = LD_UH(filter);

 3546     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3548     LD_UB4(src, 16, src0, src3, src6, src9);
 
 3551     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 3552         LD_UB2(src, src_stride, src1, src2);
 
 3553         LD_UB2(dst, dst_stride, dst0, dst1);
 
 3554         LD_UB2(src + 16, src_stride, src4, src5);
 
 3555         LD_UB2(dst + 16, dst_stride, dst2, dst3);
 
 3556         LD_UB2(src + 32, src_stride, src7, src8);
 
 3557         LD_UB2(dst + 32, dst_stride, dst4, dst5);
 
 3558         LD_UB2(src + 48, src_stride, src10, src11);
 
 3559         LD_UB2(dst + 48, dst_stride, dst6, dst7);
 
 3560         src += (2 * src_stride);
 
 3562         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
 
 3563         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
 3564         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3569         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3574         ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
 
 3575         ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
 
 3576         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 3581         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 3586         ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
 
 3587         ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
 
 3588         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
 
 3593         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
 
 3598         ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
 
 3599         ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
 
 3600         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
 
 3605         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
 
 3609         dst += (2 * dst_stride);
 
 3622                                                    const int8_t *filter_horiz,
 
 3623                                                    const int8_t *filter_vert)
 
 3626     v16u8 filt_hz, filt_vt, vec0, vec1;
 
 3627     v16u8 dst0, dst1, dst2, dst3, res0, res1;
 
 3628     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

 3633     filt = LD_UH(filter_horiz);

 3634     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

 3636     filt = LD_UH(filter_vert);

 3637     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
 3639     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 3644     hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
 
 3645     hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
 
 3646     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3648     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3649     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
 
 3650     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3655     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 3662                                                    const int8_t *filter_horiz,
 
 3663                                                    const int8_t *filter_vert)
 
 3665     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
 
 3666     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
 
 3667     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 3668     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
 
 3669     v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
 
 3675     filt = LD_SH(filter_horiz);
 
 3676     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 3678     filt = LD_SH(filter_vert);
 
 3679     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3681     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 3682     src += (8 * src_stride);
 
 3690     SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
 
 3691                hz_out3, hz_out5, 8);
 
 3692     hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
 
 3694     LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 3695     ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
 
 3697     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3698     ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
 
 3699     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
 
 3700                 tmp0, tmp1, tmp2, tmp3);
 
 3703     PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
 
 3705     AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
 
 3707     ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 
 3708     dst += (4 * dst_stride);
 
 3709     ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 
 3714                           int height, int mx, int my)
 
 3721                                                filter_horiz, filter_vert);
 
 3722     } else if (8 == height) {
 
 3724                                                filter_horiz, filter_vert);
 
 3732                                                    const int8_t *filter_horiz,
 
 3733                                                    const int8_t *filter_vert)
 
 3736     v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
 
 3737     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 3743     filt = LD_SH(filter_horiz);
 
 3744     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 3746     filt = LD_SH(filter_vert);
 
 3747     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3749     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 3750     src += (5 * src_stride);
 
 3752     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3755     vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3756     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 3759     vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3760     tmp1 = __msa_dotp_u_h(vec1, filt_vt);
 
 3763     vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3764     tmp2 = __msa_dotp_u_h(vec2, filt_vt);
 
 3767     vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3768     tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
 3780                                                        const int8_t *filter_horiz,
 
 3781                                                        const int8_t *filter_vert,
 
 3786     v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
 
 3787     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
 
 3793     filt = LD_SH(filter_horiz);
 
 3794     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 3796     filt = LD_SH(filter_vert);
 
 3797     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3804     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3805         LD_SB4(src, src_stride, src1, src2, src3, src4);
 
 3806         src += (4 * src_stride);
 
 3809         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3810         tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
 3813         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3814         tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
 3820         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
 
 3821         tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
 3824         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
 
 3825         tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
 3829         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3831                            dst3, dst, dst_stride);
 
 3832         dst += (4 * dst_stride);
 
 3838                           int height, int mx, int my)
 
 3845                                                filter_horiz, filter_vert);
 
 3849                                                    filter_horiz, filter_vert,
 
 3856                            int height, int mx, int my)

 3861     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 
 3862     v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
 
 3863     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
 
 3869     filt = LD_SH(filter_horiz);
 
 3870     filt_hz = (v16u8) __msa_splati_h(filt, 0);
 
 3872     filt = LD_SH(filter_vert);
 
 3873     filt_vt = (v16u8) __msa_splati_h(filt, 0);
 
 3875     LD_SB2(src, 8, src0, src1);
 
 3881     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 3882         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 3883         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 3884         src += (4 * src_stride);
 
 3885         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 3889         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3890         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3898         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 3899         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3907         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
 3908         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3916         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
 
 3917         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
 
 3927                            int height, int mx, int my)
 
 3931     for (multiple8_cnt = 2; multiple8_cnt--;) {
 
 3941                            int height, int mx, int my)
 
 3945     for (multiple8_cnt = 4; multiple8_cnt--;) {
 
 3958     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
 
 3959     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 3961     if (0 == height % 12) {
 
 3962         for (cnt = (height / 12); cnt--;) {
 
 3964                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 3965             src += (8 * src_stride);
 
 3967             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 3968             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 3969             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 3970             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 3971             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
 3972             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
 3973             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
 3974             out7 = __msa_copy_u_d((v2i64) src7, 0);
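              /* Plain 8-byte-wide copy: __msa_copy_u_d extracts the low 8 bytes of
                 each loaded row into a GPR and SD4 writes them out four rows at a
                 time. */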
 
 3976             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 3977             dst += (4 * dst_stride);
 
 3978             SD4(out4, out5, out6, out7, dst, dst_stride);
 
 3979             dst += (4 * dst_stride);
 
 3981             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 3982             src += (4 * src_stride);
 
 3984             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 3985             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 3986             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 3987             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 3989             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 3990             dst += (4 * dst_stride);
 
 3992     } else if (0 == height % 8) {
 
 3993         for (cnt = height >> 3; cnt--;) {
 
 3995                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 3996             src += (8 * src_stride);
 
 3998             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 3999             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 4000             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 4001             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 4002             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
 4003             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
 4004             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
 4005             out7 = __msa_copy_u_d((v2i64) src7, 0);
 
 4007             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 4008             dst += (4 * dst_stride);
 
 4009             SD4(out4, out5, out6, out7, dst, dst_stride);
 
 4010             dst += (4 * dst_stride);
 
 4012     } else if (0 == height % 4) {
 
 4013         for (cnt = (height / 4); cnt--;) {
 
 4014             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4015             src += (4 * src_stride);
 
 4016             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 4017             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 4018             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
 4019             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
 4021             SD4(out0, out1, out2, out3, dst, dst_stride);
 
 4022             dst += (4 * dst_stride);
 
 4024     } else if (0 == height % 2) {
 
 4025         for (cnt = (height / 2); cnt--;) {
 
 4026             LD_UB2(src, src_stride, src0, src1);
 
 4027             src += (2 * src_stride);
 
 4028             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
 4029             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
 4046     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4048     for (cnt = (width >> 4); cnt--;) {
 
 4052         for (loop_cnt = (height >> 3); loop_cnt--;) {
 
 4053             LD_UB8(src_tmp, src_stride,
 
 4054                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 4055             src_tmp += (8 * src_stride);
 
 4057             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
 
 4058                    dst_tmp, dst_stride);
 
 4059             dst_tmp += (8 * dst_stride);
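          /* Inner loop of the 16-column-multiple copy: 8 whole rows of one 16-byte
             column are moved per iteration; the outer loop (width >> 4) advances to
             the next 16-byte column. */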
 
 4072     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4074     if (0 == height % 12) {
 
 4075         for (cnt = (height / 12); cnt--;) {
 
 4077                    src0, src1, src2, src3, src4, src5, src6, src7);
 
 4078             src += (8 * src_stride);
 
 4079             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
 
 4081             dst += (8 * dst_stride);
 
 4083             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4084             src += (4 * src_stride);
 
 4085             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4086             dst += (4 * dst_stride);
 
 4088     } else if (0 == height % 8) {

 4090     } else if (0 == height % 4) {
 
 4091         for (cnt = (height >> 2); cnt--;) {
 
 4092             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4093             src += (4 * src_stride);
 
 4095             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4096             dst += (4 * dst_stride);
 
 4106     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4108     if (0 == height % 12) {
 
 4109         for (cnt = (height / 12); cnt--;) {
 
 4110             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4111             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
 4112             src += (4 * src_stride);
 
 4113             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4114             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
 4115             dst += (4 * dst_stride);
 
 4117             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4118             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
 4119             src += (4 * src_stride);
 
 4120             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4121             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
 4122             dst += (4 * dst_stride);
 
 4124             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4125             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
 4126             src += (4 * src_stride);
 
 4127             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4128             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
 4129             dst += (4 * dst_stride);
 
 4131     } else if (0 == height % 8) {

 4133     } else if (0 == height % 4) {
 
 4134         for (cnt = (height >> 2); cnt--;) {
 
 4135             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4136             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
 4137             src += (4 * src_stride);
 
 4138             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
 4139             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
 4140             dst += (4 * dst_stride);
 
 4157     uint32_t out0, out1, out2, out3;
 
 4159     v16u8 dst0, dst1, dst2, dst3;
 
 4161     if (0 == (height % 4)) {
 
 4162         for (cnt = (height / 4); cnt--;) {
 
 4163             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4164             src += (4 * src_stride);
 
 4166             LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 4168             AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 4169                         dst0, dst1, dst2, dst3);
 
 4171             out0 = __msa_copy_u_w((v4i32) dst0, 0);
 
 4172             out1 = __msa_copy_u_w((v4i32) dst1, 0);
 
 4173             out2 = __msa_copy_u_w((v4i32) dst2, 0);
 
 4174             out3 = __msa_copy_u_w((v4i32) dst3, 0);
 
 4175             SW4(out0, out1, out2, out3, dst, dst_stride);
 
 4176             dst += (4 * dst_stride);
 
 4178     } else if (0 == (height % 2)) {
 
 4179         for (cnt = (height / 2); cnt--;) {
 
 4180             LD_UB2(src, src_stride, src0, src1);
 
 4181             src += (2 * src_stride);
 
 4183             LD_UB2(dst, dst_stride, dst0, dst1);
 
 4187             out0 = __msa_copy_u_w((v4i32) dst0, 0);
 
 4188             out1 = __msa_copy_u_w((v4i32) dst1, 0);
 
 4202     uint64_t out0, out1, out2, out3;
 
 4204     v16u8 dst0, dst1, dst2, dst3;
 
 4206     for (cnt = (height / 4); cnt--;) {
 
 4207         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
 4208         src += (4 * src_stride);
 
 4209         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
 4211         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 4212                     dst0, dst1, dst2, dst3);
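          /* 8-wide averaging: AVER_UB4_UB computes a per-byte rounded average,
             (src + dst + 1) >> 1, and only the low 8 bytes of each result are
             written back via SD4. */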
 
 4214         out0 = __msa_copy_u_d((v2i64) dst0, 0);
 
 4215         out1 = __msa_copy_u_d((v2i64) dst1, 0);
 
 4216         out2 = __msa_copy_u_d((v2i64) dst2, 0);
 
 4217         out3 = __msa_copy_u_d((v2i64) dst3, 0);
 
 4218         SD4(out0, out1, out2, out3, dst, dst_stride);
 
 4219         dst += (4 * dst_stride);
 
 4228     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4229     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4231     for (cnt = (height / 8); cnt--;) {
 
 4232         LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 4233         src += (8 * src_stride);
 
 4234         LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 4236         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 4237                     dst0, dst1, dst2, dst3);
 
 4238         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4239                     dst4, dst5, dst6, dst7);
 
 4240         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
 
 4241         dst += (8 * dst_stride);
 
 4251     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4252     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
 4253     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4254     v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
 4256     for (cnt = (height / 8); cnt--;) {
 
 4257         LD_UB4(src, src_stride, src0, src2, src4, src6);
 
 4258         LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
 
 4259         src += (4 * src_stride);
 
 4260         LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
 
 4261         LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
 
 4262         dst_dup += (4 * dst_stride);
 
 4263         LD_UB4(src, src_stride, src8, src10, src12, src14);
 
 4264         LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
 
 4265         src += (4 * src_stride);
 
 4266         LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
 
 4267         LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
 
 4268         dst_dup += (4 * dst_stride);
 
 4270         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 4271                     dst0, dst1, dst2, dst3);
 
 4272         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4273                     dst4, dst5, dst6, dst7);
 
 4274         AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
 
 4275                     dst8, dst9, dst10, dst11);
 
 4276         AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
 
 4277                     dst12, dst13, dst14, dst15);
 
 4279         ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
 
 4280         ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
 
 4281         dst += (4 * dst_stride);
 
 4282         ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
 
 4283         ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
 
 4284         dst += (4 * dst_stride);
 
 4294     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
 4295     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
 
 4296     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 4297     v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
 4299     for (cnt = (height / 4); cnt--;) {
 
 4300         LD_UB4(src, 16, src0, src1, src2, src3);
 
 4302         LD_UB4(src, 16, src4, src5, src6, src7);
 
 4304         LD_UB4(src, 16, src8, src9, src10, src11);
 
 4306         LD_UB4(src, 16, src12, src13, src14, src15);
 
 4309         LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
 
 4310         dst_dup += dst_stride;
 
 4311         LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
 
 4312         dst_dup += dst_stride;
 
 4313         LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
 
 4314         dst_dup += dst_stride;
 
 4315         LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
 
 4316         dst_dup += dst_stride;
 
 4318         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
 
 4319                     dst0, dst1, dst2, dst3);
 
 4320         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
 
 4321                     dst4, dst5, dst6, dst7);
 
 4322         AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
 
 4323                     dst8, dst9, dst10, dst11);
 
 4324         AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
 
 4325                     dst12, dst13, dst14, dst15);
 
 4327         ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
 
 4329         ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
 
 4331         ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
 
 4333         ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
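   /* VP9 sub-pel filter table: each bank below holds the 15 fractional phases
      (mx/my = 1..15) of an 8-tap filter whose taps sum to 128; the three banks
      appear to be the regular, sharp and smooth 8-tap sets, selected through
      type_idx by the VP9_8TAP_MIPS_MSA_FUNC wrappers further down. */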
 
static const int8_t vp9_subpel_filters_msa[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
        {0, 1, -5, 126, 8, -3, 1, 0},
        {-1, 3, -10, 122, 18, -6, 2, 0},
        {-1, 4, -13, 118, 27, -9, 3, -1},
        {-1, 4, -16, 112, 37, -11, 4, -1},
        {-1, 5, -18, 105, 48, -14, 4, -1},
        {-1, 5, -19, 97, 58, -16, 5, -1},
        {-1, 6, -19, 88, 68, -18, 5, -1},
        {-1, 6, -19, 78, 78, -19, 6, -1},
        {-1, 5, -18, 68, 88, -19, 6, -1},
        {-1, 5, -16, 58, 97, -19, 5, -1},
        {-1, 4, -14, 48, 105, -18, 5, -1},
        {-1, 4, -11, 37, 112, -16, 4, -1},
        {-1, 3, -9, 27, 118, -13, 4, -1},
        {0, 2, -6, 18, 122, -10, 3, -1},
        {0, 1, -3, 8, 126, -5, 1, 0},
    },
    [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    },
    [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    },
};
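/*
 * For reference only: a minimal scalar sketch (not part of this file) of
 * how one output pixel is produced from a row of the table above.  Every
 * row sums to 128, so the accumulated product is rounded, shifted right by
 * 7 and clipped to 8 bits, matching the rounded-shift/saturate steps in the
 * vectorised filter macros.  The helper name and the -3..+4 tap window are
 * illustrative assumptions.
 *
 *     static uint8_t filt_8tap_pixel(const uint8_t *src, const int8_t *filt)
 *     {
 *         int32_t k, sum = 0;
 *
 *         for (k = 0; k < 8; k++)
 *             sum += filt[k] * src[k - 3];
 *         sum = (sum + 64) >> 7;
 *
 *         return (uint8_t) (sum < 0 ? 0 : sum > 255 ? 255 : sum);
 *     }
 */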
 
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
                                                                               \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \
                                    vfilter, h);                               \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \
                                            dststride, filter, h);             \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
                                                                               \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
                                            filter, h);                        \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
                                                 dststride, hfilter,           \
                                                 vfilter, h);                  \
}

#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
                         const uint8_t *src, ptrdiff_t srcstride,  \
                         int h, int mx, int my)                    \
{                                                                  \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
}                                                                  \
                                                                   \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
                        const uint8_t *src, ptrdiff_t srcstride,   \
                        int h, int mx, int my)                     \
{                                                                  \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);      \
}

#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
                        const uint8_t *src, ptrdiff_t srcstride,  \
                        int h, int mx, int my)                    \
{                                                                 \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
}
#undef VP9_8TAP_MIPS_MSA_FUNC
#undef VP9_COPY_AVG_MIPS_MSA_FUNC
#undef VP9_AVG_MIPS_MSA_FUNC
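/*
 * Illustrative note (the actual instantiations are not shown in this
 * excerpt): a single use such as
 *
 *     VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP)
 *
 * expands to six entry points - ff_put_8tap_sharp_16h_msa,
 * ff_put_8tap_sharp_16v_msa, ff_put_8tap_sharp_16hv_msa and the three
 * ff_avg_8tap_sharp_16* counterparts - each of which picks the filter row
 * vp9_subpel_filters_msa[FILTER_8TAP_SHARP][mx-1] (or [my-1]) and forwards
 * to the matching common_*_16w_msa worker.
 */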