30     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
 
   31     v16u8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
   33     if (0 == height % 12) {
 
   34         for (cnt = (height / 12); cnt--;) {
 
   36                    src0, src1, src2, src3, src4, src5, src6, src7);
 
   37             src += (8 * src_stride);
 
   39             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
   40             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
   41             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
   42             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
   43             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
   44             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
   45             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
   46             out7 = __msa_copy_u_d((v2i64) src7, 0);
 
   48             SD4(out0, out1, out2, out3, dst, dst_stride);
 
   49             dst += (4 * dst_stride);
 
   50             SD4(out4, out5, out6, out7, dst, dst_stride);
 
   51             dst += (4 * dst_stride);
 
   53             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
   54             src += (4 * src_stride);
 
   56             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
   57             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
   58             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
   59             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
   61             SD4(out0, out1, out2, out3, dst, dst_stride);
 
   62             dst += (4 * dst_stride);
 
   64     } 
else if (0 == height % 8) {
 
   65         for (cnt = height >> 3; cnt--;) {
 
   67                    src0, src1, src2, src3, src4, src5, src6, src7);
 
   68             src += (8 * src_stride);
 
   70             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
   71             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
   72             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
   73             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
   74             out4 = __msa_copy_u_d((v2i64) src4, 0);
 
   75             out5 = __msa_copy_u_d((v2i64) src5, 0);
 
   76             out6 = __msa_copy_u_d((v2i64) src6, 0);
 
   77             out7 = __msa_copy_u_d((v2i64) src7, 0);
 
   79             SD4(out0, out1, out2, out3, dst, dst_stride);
 
   80             dst += (4 * dst_stride);
 
   81             SD4(out4, out5, out6, out7, dst, dst_stride);
 
   82             dst += (4 * dst_stride);
 
   84     } 
else if (0 == height % 4) {
 
   85         for (cnt = (height / 4); cnt--;) {
 
   86             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
   87             src += (4 * src_stride);
 
   88             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
   89             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
   90             out2 = __msa_copy_u_d((v2i64) src2, 0);
 
   91             out3 = __msa_copy_u_d((v2i64) src3, 0);
 
   93             SD4(out0, out1, out2, out3, dst, dst_stride);
 
   94             dst += (4 * dst_stride);
 
   96     } 
else if (0 == height % 2) {
 
   97         for (cnt = (height / 2); cnt--;) {
 
   98             LD_UB2(src, src_stride, src0, src1);
 
   99             src += (2 * src_stride);
 
  100             out0 = __msa_copy_u_d((v2i64) src0, 0);
 
  101             out1 = __msa_copy_u_d((v2i64) src1, 0);
 
  115     v16u8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  117     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
  118     src += (8 * src_stride);
 
  119     ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
  120     dst += (8 * dst_stride);
 
  121     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
  122     ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
 
  131     v16u8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  133     for (cnt = (width >> 4); cnt--;) {
 
  137         for (loop_cnt = (height >> 3); loop_cnt--;) {
 
  138             LD_UB8(src_tmp, src_stride,
 
  139                    src0, src1, src2, src3, src4, src5, src6, src7);
 
  140             src_tmp += (8 * src_stride);
 
  142             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
 
  143                    dst_tmp, dst_stride);
 
  144             dst_tmp += (8 * dst_stride);
 
  157     v16u8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  159     if (0 == height % 12) {
 
  160         for (cnt = (height / 12); cnt--;) {
 
  162                    src0, src1, src2, src3, src4, src5, src6, src7);
 
  163             src += (8 * src_stride);
 
  164             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
 
  166             dst += (8 * dst_stride);
 
  168             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  169             src += (4 * src_stride);
 
  170             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  171             dst += (4 * dst_stride);
 
  173     } 
else if (0 == height % 8) {
 
  175     } 
else if (0 == height % 4) {
 
  176         for (cnt = (height >> 2); cnt--;) {
 
  177             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  178             src += (4 * src_stride);
 
  180             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  181             dst += (4 * dst_stride);
 
  199     v16u8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
  201     if (0 == height % 12) {
 
  202         for (cnt = (height / 12); cnt--;) {
 
  203             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  204             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
  205             src += (4 * src_stride);
 
  206             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  207             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
  208             dst += (4 * dst_stride);
 
  210             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  211             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
  212             src += (4 * src_stride);
 
  213             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  214             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
  215             dst += (4 * dst_stride);
 
  217             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  218             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
  219             src += (4 * src_stride);
 
  220             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  221             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
  222             dst += (4 * dst_stride);
 
  224     } 
else if (0 == height % 8) {
 
  226     } 
else if (0 == height % 4) {
 
  227         for (cnt = (height >> 2); cnt--;) {
 
  228             LD_UB4(src, src_stride, src0, src1, src2, src3);
 
  229             LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
 
  230             src += (4 * src_stride);
 
  231             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
 
  232             ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
 
  233             dst += (4 * dst_stride);
 
  254     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
  256     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
 
  258     8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
 
  /*
   * FILT_8TAP_DPADD_S_H: 8-tap filter accumulation over four input vectors.
   *
   * vec0..vec3   - input vectors, each cast to v16i8 before use.
   * filt0..filt3 - filter coefficient vectors, each cast to v16i8 before use.
   *
   * tmp0 accumulates dotp(vec0, filt0) followed by dpadd(vec1, filt1);
   * tmp1 accumulates dotp(vec2, filt2) followed by dpadd(vec3, filt3).
   * The two partial sums are then combined into tmp0 with __msa_adds_s_h
   * (signed saturating halfword add, per the MSA intrinsic reference), so the
   * final combination saturates instead of wrapping.
   */
  261 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \ 
  262                             filt0, filt1, filt2, filt3)         \ 
  266     tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \ 
  267     tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \ 
  268     tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \ 
  269     tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \ 
  270     tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \ 
  /*
   * HORIZ_8TAP_4WID_4VECS_FILT: 8-tap filter step over four source vectors
   * producing two halfword result vectors (out0, out1).
   *
   * Every shuffle pairs (src0, src1) and (src2, src3) under the same mask.
   *   mask0 -> DOTP  with filt0 -> res0_m, res1_m
   *   mask1 -> DPADD with filt1 -> accumulated into res0_m, res1_m
   *   mask2 -> DOTP  with filt2 -> res2_m, res3_m
   *   mask3 -> DPADD with filt3 -> accumulated into res2_m, res3_m
   * ADDS_SH2_SH then combines (res0_m, res2_m) and (res1_m, res3_m) into
   * out0 and out1.
   *
   * NOTE(review): VSHF_B2_SB / DOTP_SB2_SH / DPADD_SB2_SH / ADDS_SH2_SH are
   * defined outside this file; presumably byte shuffle, signed dot product,
   * dot-product accumulate and saturating add - confirm against their
   * definitions.
   */
  275 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \ 
  276                                    mask0, mask1, mask2, mask3,              \ 
  277                                    filt0, filt1, filt2, filt3,              \ 
  280     v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \ 
  281     v8i16 res0_m, res1_m, res2_m, res3_m;                                   \ 
  283     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \ 
  284     DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \ 
  285     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \ 
  286     DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \ 
  287     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \ 
  288     DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \ 
  289     VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \ 
  290     DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \ 
  291     ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \ 
  /*
   * HORIZ_8TAP_8WID_4VECS_FILT: 8-tap filter step over four source vectors
   * producing four halfword result vectors (out0..out3), one per source.
   *
   * Unlike the 4-wide variant, each source is shuffled against itself
   * (src0 with src0, src1 with src1, ...).
   *   mask0 -> DOTP  with filt0 -> res0_m..res3_m
   *   mask2 -> DOTP  with filt2 -> res4_m..res7_m
   *   mask1 -> DPADD with filt1 -> accumulated into res0_m..res3_m
   *   mask3 -> DPADD with filt3 -> accumulated into res4_m..res7_m
   * vec0_m..vec3_m are reused for the mask0 and mask2 shuffles and
   * vec4_m..vec7_m for the mask1 and mask3 shuffles.
   * ADDS_SH4_SH finally combines (res0_m, res4_m), (res1_m, res5_m),
   * (res2_m, res6_m) and (res3_m, res7_m) into out0..out3.
   *
   * NOTE(review): the helper macros are defined outside this file; the
   * saturating behaviour of ADDS_SH4_SH is assumed from its name - confirm.
   */
  294 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \ 
  295                                    mask0, mask1, mask2, mask3,                \ 
  296                                    filt0, filt1, filt2, filt3,                \ 
  297                                    out0, out1, out2, out3)                    \ 
  299     v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \ 
  300     v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \ 
  302     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \ 
  303     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \ 
  304     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \ 
  305                 res0_m, res1_m, res2_m, res3_m);                              \ 
  306     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \ 
  307     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \ 
  308     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \ 
  309                 res4_m, res5_m, res6_m, res7_m);                              \ 
  310     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \ 
  311     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \ 
  312     DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \ 
  313                  res0_m, res1_m, res2_m, res3_m);                             \ 
  314     VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \ 
  315     VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \ 
  316     DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \ 
  317                  res4_m, res5_m, res6_m, res7_m);                             \ 
  318     ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \ 
  319                 res7_m, out0, out1, out2, out3);                              \ 
  /*
   * FILT_4TAP_DPADD_S_H: 4-tap filter accumulation over two input vectors.
   *
   * vec0, vec1   - input vectors, each cast to v16i8 before use.
   * filt0, filt1 - filter coefficient vectors, each cast to v16i8 before use.
   *
   * tmp0 is set to dotp(vec0, filt0) and then accumulates dpadd(vec1, filt1).
   * Unlike the 8-tap variant there is a single accumulator, so no final
   * saturating add is needed.
   */
  322 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \ 
  326     tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \ 
  327     tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \ 
  /*
   * HORIZ_4TAP_4WID_4VECS_FILT: 4-tap filter step over four source vectors,
   * writing its results directly into out0 and out1.
   *
   * Every shuffle pairs (src0, src1) and (src2, src3) under the same mask.
   *   mask0 -> DOTP  with filt0 -> out0, out1
   *   mask1 -> DPADD with filt1 -> accumulated into out0, out1
   * No intermediate result vectors are used; out0/out1 serve as accumulators.
   *
   * NOTE(review): VSHF_B2_SB / DOTP_SB2_SH / DPADD_SB2_SH are defined outside
   * this file - confirm their exact semantics there.
   */
  332 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \ 
  333                                    mask0, mask1, filt0, filt1,         \ 
  336     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \ 
  338     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \ 
  339     DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \ 
  340     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \ 
  341     DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \ 
  /*
   * HORIZ_4TAP_8WID_4VECS_FILT: 4-tap filter step over four source vectors
   * producing four result vectors (out0..out3), one per source.
   *
   * Each source is shuffled against itself (src0 with src0, ...).
   *   mask0 -> DOTP  with filt0 -> out0..out3
   *   mask1 -> DPADD with filt1 -> accumulated into out0..out3
   * vec0_m..vec3_m are reused: the mask1 shuffles overwrite the mask0
   * shuffles once the first dot product has consumed them.
   *
   * NOTE(review): VSHF_B2_SB / DOTP_SB4_SH / DPADD_SB4_SH are defined outside
   * this file - confirm their exact semantics there.
   */
  344 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \ 
  345                                    mask0, mask1, filt0, filt1,                \ 
  346                                    out0, out1, out2, out3)                    \ 
  348     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \ 
  350     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \ 
  351     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \ 
  352     DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \ 
  353                 out0, out1, out2, out3);                                      \ 
  354     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \ 
  355     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \ 
  356     DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \ 
  357                  out0, out1, out2, out3);                                     \ 
  364     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  365     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  366     v8i16 
filt, out0, out1;
 
  371     rnd_vec = __msa_fill_h(rnd_val);
 
  374     filt = 
LD_SH(filter);
 
  375     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  381     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  384                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  388     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  395     v16i8 filt0, filt1, filt2, filt3;
 
  397     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  398     v8i16 
filt, out0, out1, out2, out3;
 
  403     rnd_vec = __msa_fill_h(rnd_val);
 
  406     filt = 
LD_SH(filter);
 
  407     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  413     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  415     src += (4 * src_stride);
 
  417                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  418     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  421                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  425     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  426     dst += (4 * dst_stride);
 
  428     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  435     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  436     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  437     v8i16 
filt, out0, out1, out2, out3;
 
  442     rnd_vec = __msa_fill_h(rnd_val);
 
  445     filt = 
LD_SH(filter);
 
  446     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  452     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  454     src += (4 * src_stride);
 
  456                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  457     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  459     src += (4 * src_stride);
 
  461                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  465     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  466     dst += (4 * dst_stride);
 
  468     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  469     dst += (4 * dst_stride);
 
  471     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  473     src += (4 * src_stride);
 
  475                                mask3, filt0, filt1, filt2, filt3, out0, out1);
 
  476     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  478     src += (4 * src_stride);
 
  480                                mask3, filt0, filt1, filt2, filt3, out2, out3);
 
  485     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  486     dst += (4 * dst_stride);
 
  488     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
  497     } 
else if (8 == height) {
 
  499     } 
else if (16 == height) {
 
  509     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  510     v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
 
  511     v8i16 
filt, out0, out1, out2, out3;
 
  516     rnd_vec = __msa_fill_h(rnd_val);
 
  519     filt = 
LD_SH(filter);
 
  520     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  526     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  529                                mask3, filt0, filt1, filt2, filt3, out0, out1,
 
  535     ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
  544     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  545     v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
 
  546     v8i16 
filt, out0, out1, out2, out3;
 
  551     rnd_vec = __msa_fill_h(rnd_val);
 
  554     filt = 
LD_SH(filter);
 
  555     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  561     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  562         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  564         src += (4 * src_stride);
 
  566                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  572         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
  573         dst += (4 * dst_stride);
 
  597     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  598     v8i16 
filt, out0, out1, out2, out3;
 
  599     v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
 
  604     rnd_vec = __msa_fill_h(rnd_val);
 
  613     filt = 
LD_SH(filter);
 
  614     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  623     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
  625         LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
 
  627         src1_ptr += (4 * src_stride);
 
  629                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  635         ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
 
  636         dst1 += (4 * dst_stride);
 
  639         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
  641         src += (4 * src_stride);
 
  643                                    mask6, filt0, filt1, filt2, filt3, out0,
 
  648         ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
 
  649         dst += (4 * dst_stride);
 
  659     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  660     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  661     v8i16 
filt, out0, out1, out2, out3;
 
  666     rnd_vec = __msa_fill_h(rnd_val);
 
  669     filt = 
LD_SH(filter);
 
  670     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  676     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  677         LD_SB2(src, src_stride, src0, src2);
 
  678         LD_SB2(src + 8, src_stride, src1, src3);
 
  680         src += (2 * src_stride);
 
  682                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  701     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  702     v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, 
out;
 
  703     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
 
  705     v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
 
  711     rnd_vec = __msa_fill_h(rnd_val);
 
  714     filt = 
LD_SH(filter);
 
  715     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  725     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  726         LD_SB2(src, src_stride, src0, src2);
 
  727         LD_SB2(src + 16, src_stride, src1, src3);
 
  729         src += (2 * src_stride);
 
  730         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
 
  731         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
 
  732         VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
 
  733         DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
 
  736         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
 
  737         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
 
  738         VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
 
  739         DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
 
  742         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
 
  743         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
 
  744         VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
 
  745         DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
 
  746                      out0, out8, out2, out9);
 
  748         VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
 
  749         VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
 
  750         VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
 
  751         DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
 
  752                      out4, out10, out6, out11);
 
  754         ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
 
  762         ST8x2_UB(out, dst + 16, dst_stride);
 
  778     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  779     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  780     v8i16 
filt, out0, out1, out2, out3;
 
  785     rnd_vec = __msa_fill_h(rnd_val);
 
  788     filt = 
LD_SH(filter);
 
  789     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  795     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
  797         src2 = 
LD_SB(src + 16);
 
  798         src3 = 
LD_SB(src + 24);
 
  799         src1 = __msa_sldi_b(src2, src0, 8);
 
  803                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  809         src2 = 
LD_SB(src + 16);
 
  810         src3 = 
LD_SB(src + 24);
 
  811         src1 = __msa_sldi_b(src2, src0, 8);
 
  817         ST_UB(out, dst + 16);
 
  822                                    mask3, filt0, filt1, filt2, filt3, out0,
 
  829         ST_UB(out, dst + 16);
 
  840     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
 
  841     v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, 
out;
 
  842     v8i16 
filt, out0, out1, out2, out3, out4, out5, out6;
 
  847     rnd_vec = __msa_fill_h(rnd_val);
 
  850     filt = 
LD_SH(filter);
 
  851     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  861     for (loop_cnt = height; loop_cnt--;) {
 
  862         LD_SB3(src, 16, src0, src2, src3);
 
  863         src1 = __msa_sldi_b(src2, src0, 8);
 
  866         VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
 
  868         DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
 
  869         VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
 
  872         out2 = __msa_dpadd_s_h(out2, vec2, filt1);
 
  873         VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
 
  875         DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
 
  876         VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
 
  879         out5 = __msa_dpadd_s_h(out5, vec2, filt3);
 
  881         out2 = __msa_adds_s_h(out2, out5);
 
  883         out6 = __msa_srar_h(out2, rnd_vec);
 
  888         src1 = 
LD_SB(src + 40);
 
  890         src1 = (v16i8) __msa_xori_b((v16u8) 
src1, 128);
 
  892         VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
 
  894         DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
 
  895         VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
 
  898         out2 = __msa_dpadd_s_h(out2, vec2, filt1);
 
  899         VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
 
  901         DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
 
  902         VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
 
  905         out5 = __msa_dpadd_s_h(out5, vec2, filt3);
 
  907         out5 = __msa_adds_s_h(out2, out5);
 
  911         ST_UB(out, dst + 16);
 
  913         ST_UB(out, dst + 32);
 
  924     v16i8 
src0, 
src1, src2, src3, filt0, filt1, filt2, filt3;
 
  925     v16u8 mask0, mask1, mask2, mask3, 
out;
 
  926     v8i16 
filt, out0, out1, out2, out3;
 
  931     rnd_vec = __msa_fill_h(rnd_val);
 
  934     filt = 
LD_SH(filter);
 
  935     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  941     for (loop_cnt = height; loop_cnt--;) {
 
  943         src2 = 
LD_SB(src + 16);
 
  944         src3 = 
LD_SB(src + 24);
 
  945         src1 = __msa_sldi_b(src2, src0, 8);
 
  949                                    mask2, mask3, filt0, filt1, filt2, filt3,
 
  950                                    out0, out1, out2, out3);
 
  956         ST_UB(out, dst + 16);
 
  958         src0 = 
LD_SB(src + 32);
 
  959         src2 = 
LD_SB(src + 48);
 
  960         src3 = 
LD_SB(src + 56);
 
  961         src1 = __msa_sldi_b(src2, src0, 8);
 
  966                                    mask2, mask3, filt0, filt1, filt2, filt3,
 
  967                                    out0, out1, out2, out3);
 
  971         ST_UB(out, dst + 32);
 
  973         ST_UB(out, dst + 48);
 
  984     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
  985     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
  986     v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
 
  987     v16i8 src10998, filt0, filt1, filt2, filt3;
 
  989     v8i16 
filt, out10, out32;
 
  992     src -= (3 * src_stride);
 
  993     rnd_vec = __msa_fill_h(rnd_val);
 
  995     filt = 
LD_SH(filter);
 
  996     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
  998     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
  999     src += (7 * src_stride);
 
 1001     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
 1003     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1004     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
 
 1008     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1009         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1010         src += (4 * src_stride);
 
 1012         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1013                    src87_r, src98_r, src109_r);
 
 1014         ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
 
 1017                                     filt1, filt2, filt3);
 
 1019                                     filt1, filt2, filt3);
 
 1023         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1024         dst += (4 * dst_stride);
 
 1039     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1040     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1041     v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
 
 1043     v8i16 
filt, out0_r, out1_r, out2_r, out3_r;
 
 1046     src -= (3 * src_stride);
 
 1047     rnd_vec = __msa_fill_h(rnd_val);
 
 1049     filt = 
LD_SH(filter);
 
 1050     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1052     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1054     src += (7 * src_stride);
 
 1055     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
 1057     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1059     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1060         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1062         src += (4 * src_stride);
 
 1064         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1065                    src87_r, src98_r, src109_r);
 
 1067                                      filt1, filt2, filt3);
 
 1069                                      filt1, filt2, filt3);
 
 1071                                      filt1, filt2, filt3);
 
 1073                                      filt1, filt2, filt3);
 
 1074         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 1075         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1078         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
 1079         dst += (4 * dst_stride);
 
 1097     uint32_t out2, out3;
 
 1098     uint64_t out0, out1;
 
 1099     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
 
 1100     v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1101     v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
 
 1102     v8i16 
filt, filt0, filt1, filt2, filt3;
 
 1104     v4i32 
mask = { 2, 6, 2, 6 };
 
 1106     src -= (3 * src_stride);
 
 1107     rnd_vec = __msa_fill_h(rnd_val);
 
 1110     filt = 
LD_SH(filter);
 
 1111     SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1113     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1114     src += (7 * src_stride);
 
 1119     VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
 
 1120     VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
 
 1121     VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
 
 1123     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 1124         LD_SB2(src, src_stride, src7, src8);
 
 1126         src += (2 * src_stride);
 
 1128         ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
 
 1129                    vec01, vec23, vec45, vec67);
 
 1132         ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
 
 1138         VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
 
 1139         ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
 
 1145         PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
 
 1148         out0 = __msa_copy_u_d((v2i64) res0, 0);
 
 1149         out1 = __msa_copy_u_d((v2i64) res1, 0);
 
 1150         out2 = __msa_copy_u_w((v4i32) res2, 0);
 
 1151         out3 = __msa_copy_u_w((v4i32) res2, 1);
 
 1153         SW(out2, (dst + 8));
 
 1156         SW(out3, (dst + 8));
 
 1181     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1182     v16i8 filt0, filt1, filt2, filt3;
 
 1183     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1184     v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
 
 1185     v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
 
 1186     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1187     v8i16 
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 1190     src -= (3 * src_stride);
 
 1191     rnd_vec = __msa_fill_h(rnd_val);
 
 1193     filt = 
LD_SH(filter);
 
 1194     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1196     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1198     src += (7 * src_stride);
 
 1199     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
 
 1201     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1202     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
 
 1204     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1206     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1207         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 1209         src += (4 * src_stride);
 
 1211         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1212                    src87_r, src98_r, src109_r);
 
 1213         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
 
 1214                    src87_l, src98_l, src109_l);
 
 1216                                      filt1, filt2, filt3);
 
 1218                                      filt1, filt2, filt3);
 
 1220                                      filt1, filt2, filt3);
 
 1222                                      filt1, filt2, filt3);
 
 1224                                      filt1, filt2, filt3);
 
 1226                                      filt1, filt2, filt3);
 
 1228                                      filt1, filt2, filt3);
 
 1230                                      filt1, filt2, filt3);
 
 1231         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 1232         SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
 
 1233         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1234         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 1235         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 1236                     out3_r, tmp0, tmp1, tmp2, tmp3);
 
 1238         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 
 1239         dst += (4 * dst_stride);
 
 1264     uint32_t loop_cnt, cnt;
 
 1265     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 1266     v16i8 filt0, filt1, filt2, filt3;
 
 1267     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
 
 1268     v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
 
 1269     v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
 
 1270     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1271     v8i16 
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 1274     src -= (3 * src_stride);
 
 1275     rnd_vec = __msa_fill_h(rnd_val);
 
 1277     filt = 
LD_SH(filter);
 
 1278     SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1280     for (cnt = (width >> 4); cnt--;) {
 
 1284         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1286         src_tmp += (7 * src_stride);
 
 1287         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
 
 1288                    src32_r, src54_r, src21_r);
 
 1289         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
 1290         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
 
 1291                    src32_l, src54_l, src21_l);
 
 1292         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
 1294         for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1295             LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
 
 1297             src_tmp += (4 * src_stride);
 
 1298             ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
 
 1299                        src87_r, src98_r, src109_r);
 
 1300             ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
 
 1301                        src87_l, src98_l, src109_l);
 
 1303                                          filt0, filt1, filt2, filt3);
 
 1305                                          filt0, filt1, filt2, filt3);
 
 1307                                          filt0, filt1, filt2, filt3);
 
 1309                                          filt0, filt1, filt2, filt3);
 
 1311                                          filt0, filt1, filt2, filt3);
 
 1313                                          filt0, filt1, filt2, filt3);
 
 1315                                          filt0, filt1, filt2, filt3);
 
 1317                                          filt0, filt1, filt2, filt3);
 
 1318             SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 1319             SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
 
 1320             SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 1321             SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 1322             PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 1323                         out3_r, tmp0, tmp1, tmp2, tmp3);
 
 1325             ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
 
 1326             dst_tmp += (4 * dst_stride);
 
 1387                                   const int8_t *filter_x,
 
 1388                                   const int8_t *filter_y,
 
 1392     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1393     v8i16 filt0, filt1, filt2, filt3;
 
 1394     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
 
 1395     v16i8 mask1, mask2, mask3;
 
 1396     v8i16 filter_vec, const_vec;
 
 1397     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1398     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1399     v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
 
 1400     v4i32 dst0_r, dst1_r;
 
 1401     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 1402     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 1403     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
 
 1404     v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
 
 1406     src -= ((3 * src_stride) + 3);
 
 1407     filter_vec = 
LD_SH(filter_x);
 
 1408     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1410     filter_vec = 
LD_SH(filter_y);
 
 1411     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 1412     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 1414     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 1420     const_vec = __msa_ldi_h(128);
 
 1423     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1424     src += (7 * src_stride);
 
 1427     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
 
 1428     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
 
 1429     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
 
 1430                vec8, vec9, vec10, vec11);
 
 1431     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
 
 1432                vec12, vec13, vec14, vec15);
 
 1435     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1436                  dst30, dst30, dst30, dst30);
 
 1438     DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1439                  dst41, dst41, dst41, dst41);
 
 1441     DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1442                  dst52, dst52, dst52, dst52);
 
 1444     DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
 
 1445                  dst63, dst63, dst63, dst63);
 
 1447     ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
 
 1448                dst10_r, dst21_r, dst32_r);
 
 1449     dst43_r = __msa_ilvl_h(dst41, dst30);
 
 1450     dst54_r = __msa_ilvl_h(dst52, dst41);
 
 1451     dst65_r = __msa_ilvl_h(dst63, dst52);
 
 1452     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
 
 1454     for (loop_cnt = height >> 1; loop_cnt--;) {
 
 1455         LD_SB2(src, src_stride, src7, src8);
 
 1456         src += 2 * src_stride;
 
 1459         VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
 
 1460                    vec0, vec1, vec2, vec3);
 
 1462         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1463                      dst87, dst87, dst87, dst87);
 
 1465         dst76_r = __msa_ilvr_h(dst87, dst66);
 
 1467                                 filt_h0, filt_h1, filt_h2, filt_h3);
 
 1468         dst87_r = __msa_vshf_h(mask4, dst87, dst87);
 
 1470                                 filt_h0, filt_h1, filt_h2, filt_h3);
 
 1480         dst += (2 * dst_stride);
 
 1488         dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
 
 1496                                            const int8_t *filter_x,
 
 1497                                            const int8_t *filter_y,
 
 1500     uint32_t loop_cnt, cnt;
 
 1503     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 1504     v8i16 filt0, filt1, filt2, filt3;
 
 1505     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
 
 1506     v16i8 mask1, mask2, mask3;
 
 1507     v8i16 filter_vec, const_vec;
 
 1508     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1509     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 1510     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 1511     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 1512     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 1513     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
 
 1514     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 1515     v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
 
 1516     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 1518     src -= ((3 * src_stride) + 3);
 
 1519     const_vec = __msa_ldi_h(128);
 
 1522     filter_vec = 
LD_SH(filter_x);
 
 1523     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 1525     filter_vec = 
LD_SH(filter_y);
 
 1526     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 1527     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 1529     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 1535     for (cnt = width >> 3; cnt--;) {
 
 1539         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
 
 1540         src_tmp += (7 * src_stride);
 
 1544         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
 
 1545                    vec0, vec1, vec2, vec3);
 
 1546         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
 
 1547                    vec4, vec5, vec6, vec7);
 
 1548         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
 
 1549                    vec8, vec9, vec10, vec11);
 
 1550         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
 
 1551                    vec12, vec13, vec14, vec15);
 
 1553         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1554                      dst0, dst0, dst0, dst0);
 
 1556         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1557                      dst1, dst1, dst1, dst1);
 
 1559         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1560                      dst2, dst2, dst2, dst2);
 
 1562         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
 
 1563                      dst3, dst3, dst3, dst3);
 
 1565         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
 
 1566                    vec0, vec1, vec2, vec3);
 
 1567         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
 
 1568                    vec4, vec5, vec6, vec7);
 
 1569         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
 
 1570                    vec8, vec9, vec10, vec11);
 
 1572         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1573                      dst4, dst4, dst4, dst4);
 
 1575         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
 
 1576                      dst5, dst5, dst5, dst5);
 
 1578         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
 
 1579                      dst6, dst6, dst6, dst6);
 
 1581         ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
 
 1582                    dst10_r, dst32_r, dst54_r, dst21_r);
 
 1583         ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
 
 1584         ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
 
 1585                    dst10_l, dst32_l, dst54_l, dst21_l);
 
 1586         ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
 
 1588         for (loop_cnt = height >> 1; loop_cnt--;) {
 
 1589             LD_SB2(src_tmp, src_stride, src7, src8);
 
 1591             src_tmp += 2 * src_stride;
 
 1593             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
 
 1594                        vec0, vec1, vec2, vec3);
 
 1596             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1597                          dst7, dst7, dst7, dst7);
 
 1601                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 1603                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 1607             VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
 
 1608                        vec0, vec1, vec2, vec3);
 
 1610             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
 
 1611                          dst8, dst8, dst8, dst8);
 
 1615                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 1617                                     filt_h0, filt_h1, filt_h2, filt_h3);
 
 1627             ST8x2_UB(dst0_r, dst_tmp, dst_stride);
 
 1628             dst_tmp += (2 * dst_stride);
 
 1654                                   const int8_t *filter_x,
 
 1655                                   const int8_t *filter_y,
 
 1659                                    filter_x, filter_y, height, 8);
 
 1666                                    const int8_t *filter_x,
 
 1667                                    const int8_t *filter_y,
 
 1671                                    filter_x, filter_y, height, 8);
 
 1674                           filter_x, filter_y, height);
 
 1681                                    const int8_t *filter_x,
 
 1682                                    const int8_t *filter_y,
 
 1686                                    filter_x, filter_y, height, 16);
 
 1693                                    const int8_t *filter_x,
 
 1694                                    const int8_t *filter_y,
 
 1698                                    filter_x, filter_y, height, 24);
 
 1705                                    const int8_t *filter_x,
 
 1706                                    const int8_t *filter_y,
 
 1710                                    filter_x, filter_y, height, 32);
 
 1717                                    const int8_t *filter_x,
 
 1718                                    const int8_t *filter_y,
 
 1722                                    filter_x, filter_y, height, 48);
 
 1729                                    const int8_t *filter_x,
 
 1730                                    const int8_t *filter_y,
 
 1734                                    filter_x, filter_y, height, 64);
 
 1741     v16i8 filt0, filt1, 
src0, 
src1, mask0, mask1, vec0, vec1;
 
 1748     rnd_vec = __msa_fill_h(rnd_val);
 
 1751     filt = 
LD_SH(filter);
 
 1756     LD_SB2(src, src_stride, src0, src1);
 
 1758     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
 
 1760     res0 = __msa_srar_h(res0, rnd_vec);
 
 1761     res0 = __msa_sat_s_h(res0, 7);
 
 1770     v16i8 
src0, 
src1, src2, src3, filt0, filt1, mask0, mask1;
 
 1771     v8i16 
filt, out0, out1;
 
 1777     rnd_vec = __msa_fill_h(rnd_val);
 
 1780     filt = 
LD_SH(filter);
 
 1785     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1788                                filt0, filt1, out0, out1);
 
 1792     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1799     v16i8 
src0, 
src1, src2, src3, filt0, filt1, mask0, mask1;
 
 1801     v8i16 
filt, out0, out1, out2, out3;
 
 1806     rnd_vec = __msa_fill_h(rnd_val);
 
 1809     filt = 
LD_SH(filter);
 
 1814     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1815     src += (4 * src_stride);
 
 1819                                filt0, filt1, out0, out1);
 
 1820     LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1823                                filt0, filt1, out2, out3);
 
 1827     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1828     dst += (4 * dst_stride);
 
 1830     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1837     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
 1838     v16i8 filt0, filt1, mask0, mask1;
 
 1840     v8i16 
filt, out0, out1, out2, out3;
 
 1845     rnd_vec = __msa_fill_h(rnd_val);
 
 1848     filt = 
LD_SH(filter);
 
 1853     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 1854     src += (8 * src_stride);
 
 1857                                filt0, filt1, out0, out1);
 
 1859                                filt0, filt1, out2, out3);
 
 1863     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1864     dst += (4 * dst_stride);
 
 1866     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1867     dst += (4 * dst_stride);
 
 1869     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
 
 1870     src += (8 * src_stride);
 
 1873                                filt0, filt1, out0, out1);
 
 1875                                filt0, filt1, out2, out3);
 
 1879     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1880     dst += (4 * dst_stride);
 
 1882     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 1892     } 
else if (4 == height) {
 
 1894     } 
else if (8 == height) {
 
 1896     } 
else if (16 == height) {
 
 1908     v16i8 
src0, 
src1, src2, src3, filt0, filt1, mask0, mask1;
 
 1910     v8i16 
filt, out0, out1, out2, out3;
 
 1915     rnd_vec = __msa_fill_h(rnd_val);
 
 1918     filt = 
LD_SH(filter);
 
 1923     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1924         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 1925         src += (4 * src_stride);
 
 1929                                    filt1, out0, out1, out2, out3);
 
 1935         ST6x4_UB(out4, out5, dst, dst_stride);
 
 1936         dst += (4 * dst_stride);
 
 1946     v16i8 
src0, 
src1, filt0, filt1, mask0, mask1;
 
 1948     v8i16 
filt, vec0, vec1, vec2, vec3;
 
 1953     rnd_vec = __msa_fill_h(rnd_val);
 
 1955     filt = 
LD_SH(filter);
 
 1960     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 1961         LD_SB2(src, src_stride, src0, src1);
 
 1962         src += (2 * src_stride);
 
 1965         VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
 
 1966         DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
 
 1967         VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
 
 1973         dst += (2 * dst_stride);
 
 1983     v16i8 
src0, 
src1, src2, src3, filt0, filt1, mask0, mask1;
 
 1985     v8i16 
filt, out0, out1, out2, out3;
 
 1990     rnd_vec = __msa_fill_h(rnd_val);
 
 1993     filt = 
LD_SH(filter);
 
 1998     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 1999         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2000         src += (4 * src_stride);
 
 2004                                    filt1, out0, out1, out2, out3);
 
 2009         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
 2010         dst += (4 * dst_stride);
 
 2019     if ((2 == height) || (6 == height)) {
 
 2034     v16i8 
src0, 
src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
 
 2035     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
 
 2038     v8i16 
filt, out0, out1, out2, out3, out4, out5;
 
 2047     filt = 
LD_SH(filter);
 
 2053     rnd_vec = __msa_fill_h(rnd_val);
 
 2055     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2056         LD_SB4(src, src_stride, src0, src1, src2, src3);
 
 2057         src += (4 * src_stride);
 
 2060         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
 
 2061         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
 
 2062         VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
 
 2063         DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
 
 2064                     out2, out3, out4, out5);
 
 2065         DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
 
 2066         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
 
 2067         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
 
 2068         VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
 
 2069         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
 
 2070                      out2, out3, out4, out5);
 
 2078         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
 2080         ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
 
 2081         dst += (4 * dst_stride);
 
 2091     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
 2092     v16i8 filt0, filt1, mask0, mask1;
 
 2093     v8i16 
filt, out0, out1, out2, out3, out4, out5, out6, out7;
 
 2099     rnd_vec = __msa_fill_h(rnd_val);
 
 2102     filt = 
LD_SH(filter);
 
 2107     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2108         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2109         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
 
 2110         src += (4 * src_stride);
 
 2114                                    filt1, out0, out1, out2, out3);
 
 2116                                    filt1, out4, out5, out6, out7);
 
 2143     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
 2144     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2145     v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
 
 2146     v8i16 
filt, out0, out1, out2, out3;
 
 2152     rnd_vec = __msa_fill_h(rnd_val);
 
 2155     filt = 
LD_SH(filter);
 
 2160     mask11 = mask0 + 10;
 
 2162     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2163         LD_SB4(src, src_stride, src0, src2, src4, src6);
 
 2164         LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
 
 2165         src += (4 * src_stride);
 
 2168         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
 
 2169         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
 
 2170         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
 
 2171         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
 
 2172         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2173                     out0, out1, out2, out3);
 
 2174         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
 
 2175                      out0, out1, out2, out3);
 
 2185         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
 
 2186         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
 
 2187         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
 
 2188         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
 
 2189         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2190                     out0, out1, out2, out3);
 
 2191         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
 
 2192                      out0, out1, out2, out3);
 
 2203         VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
 
 2204         VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
 
 2205         VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
 
 2206         VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
 
 2208         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
 
 2209                     out0, out1, out2, out3);
 
 2210         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
 
 2211                      out0, out1, out2, out3);
 
 2217         ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
 
 2218         dst1 += (4 * dst_stride);
 
 2228     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7;
 
 2229     v16i8 filt0, filt1, mask0, mask1;
 
 2231     v8i16 
filt, out0, out1, out2, out3, out4, out5, out6, out7;
 
 2236     rnd_vec = __msa_fill_h(rnd_val);
 
 2239     filt = 
LD_SH(filter);
 
 2244     for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 2246         src2 = 
LD_SB(src + 16);
 
 2247         src3 = 
LD_SB(src + 24);
 
 2250         src6 = 
LD_SB(src + 16);
 
 2251         src7 = 
LD_SB(src + 24);
 
 2252         SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
 
 2257                                    filt0, filt1, out0, out1, out2, out3);
 
 2259                                    filt0, filt1, out4, out5, out6, out7);
 
 2267         ST_UB(out, dst + 16);
 
 2272         ST_UB(out, dst + 16);
 
 2281     v16i8 
src0, 
src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
 
 2282     v16i8 src2110, src4332, filt0, filt1;
 
 2288     rnd_vec = __msa_fill_h(rnd_val);
 
 2290     filt = 
LD_SH(filter);
 
 2293     LD_SB3(src, src_stride, src0, src1, src2);
 
 2294     src += (3 * src_stride);
 
 2296     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2297     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 2298     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 2299     LD_SB2(src, src_stride, src3, src4);
 
 2300     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 2301     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
 
 2302     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
 
 2304     out10 = __msa_srar_h(out10, rnd_vec);
 
 2305     out10 = __msa_sat_s_h(out10, 7);
 
 2316     v16i8 
src0, 
src1, src2, src3, src4, src5;
 
 2317     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
 
 2318     v16i8 src2110, src4332, filt0, filt1;
 
 2319     v8i16 
filt, out10, out32;
 
 2324     rnd_vec = __msa_fill_h(rnd_val);
 
 2326     filt = 
LD_SH(filter);
 
 2329     LD_SB3(src, src_stride, src0, src1, src2);
 
 2330     src += (3 * src_stride);
 
 2332     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2334     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
 
 2335     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 2337     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2338         LD_SB3(src, src_stride, src3, src4, src5);
 
 2339         src += (3 * src_stride);
 
 2340         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 2341         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
 
 2342         src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
 
 2346         src += (src_stride);
 
 2347         ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
 
 2348         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
 
 2349         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
 
 2354         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
 2355         dst += (4 * dst_stride);
 
 2378     v16u8 
src0, 
src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
 
 2379     v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
 
 2380     v8i16 
filt, filt0, filt1;
 
 2384     rnd_vec = __msa_fill_h(rnd_val);
 
 2387     filt = 
LD_SH(filter);
 
 2390     LD_UB3(src, src_stride, src0, src1, src2);
 
 2391     src += (3 * src_stride);
 
 2393     vec0 = (v16u8) __msa_xori_b((v16u8) 
src0, 128);
 
 2394     vec1 = (v16u8) __msa_xori_b((v16u8) 
src1, 128);
 
 2395     vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
 
 2397     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2398         LD_UB4(src, src_stride, src3, src0, src1, src2);
 
 2399         src += (4 * src_stride);
 
 2401         vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
 
 2402         ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
 
 2405         vec0 = __msa_xori_b((v16u8) src0, 128);
 
 2406         ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
 
 2409         vec1 = __msa_xori_b((v16u8) src1, 128);
 
 2410         vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
 
 2413         vec2 = __msa_xori_b((v16u8) src2, 128);
 
 2414         vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
 
 2421         ST6x4_UB(out0, out1, dst, dst_stride);
 
 2422         dst += (4 * dst_stride);
 
 2430     v16i8 
src0, 
src1, src2, src3, src4;
 
 2431     v8i16 src01, src12, src23, src34, tmp0, tmp1, 
filt, filt0, filt1;
 
 2436     rnd_vec = __msa_fill_h(rnd_val);
 
 2439     filt = 
LD_SH(filter);
 
 2442     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 2444     ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
 
 2446     ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
 
 2459     uint64_t out0, out1, out2;
 
 2460     v16i8 
src0, 
src1, src2, src3, src4, src5;
 
 2461     v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
 
 2462     v8i16 
filt, filt0, filt1;
 
 2466     rnd_vec = __msa_fill_h(rnd_val);
 
 2469     filt = 
LD_SH(filter);
 
 2472     LD_SB3(src, src_stride, src0, src1, src2);
 
 2473     src += (3 * src_stride);
 
 2476     ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
 
 2478     for (loop_cnt = 2; loop_cnt--;) {
 
 2479         LD_SB3(src, src_stride, src3, src4, src5);
 
 2480         src += (3 * src_stride);
 
 2483         ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
 
 2492         out0 = __msa_copy_u_d((v2i64) tmp0, 0);
 
 2493         out1 = __msa_copy_u_d((v2i64) tmp0, 1);
 
 2494         out2 = __msa_copy_u_d((v2i64) tmp2, 0);
 
 2514     v16i8 
src0, 
src1, src2, src7, src8, src9, src10;
 
 2515     v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
 
 2517     v8i16 
filt, out0_r, out1_r, out2_r, out3_r;
 
 2521     rnd_vec = __msa_fill_h(rnd_val);
 
 2523     filt = 
LD_SH(filter);
 
 2526     LD_SB3(src, src_stride, src0, src1, src2);
 
 2527     src += (3 * src_stride);
 
 2530     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2532     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2533         LD_SB4(src, src_stride, src7, src8, src9, src10);
 
 2534         src += (4 * src_stride);
 
 2537         ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
 
 2538                    src72_r, src87_r, src98_r, src109_r);
 
 2543         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 2544         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2547         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 
 2548         dst += (4 * dst_stride);
 
 2563     } 
else if (6 == height) {
 
 2567                                  filter, height, rnd_val);
 
 2577     v16i8 
src0, 
src1, src2, src3, src4, src5, src6;
 
 2578     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 2580     v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
 
 2581     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, 
filt, filt0, filt1;
 
 2582     v4u32 
mask = { 2, 6, 2, 6 };
 
 2586     filt = 
LD_SH(filter);
 
 2589     rnd_vec = __msa_fill_h(rnd_val);
 
 2593     LD_SB3(src, src_stride, src0, src1, src2);
 
 2594     src += (3 * src_stride);
 
 2597     VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
 
 2599     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2600         LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 2601         src += (4 * src_stride);
 
 2604         ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
 
 2605         VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
 
 2606         VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
 
 2608         ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
 
 2609                    src21, src43, src54, src65);
 
 2613         ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
 
 2622         ST8x4_UB(out0, out1, dst, dst_stride);
 
 2624         ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
 
 2625         dst += (4 * dst_stride);
 
 2642     v16i8 
src0, 
src1, src2, src3, src4, src5, src6;
 
 2643     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
 
 2644     v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
 
 2645     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 2646     v8i16 
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 2650     rnd_vec = __msa_fill_h(rnd_val);
 
 2652     filt = 
LD_SH(filter);
 
 2655     LD_SB3(src, src_stride, src0, src1, src2);
 
 2656     src += (3 * src_stride);
 
 2659     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2660     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 2662     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2663         LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 2664         src += (4 * src_stride);
 
 2667         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
 
 2668                    src32_r, src43_r, src54_r, src65_r);
 
 2669         ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
 
 2670                    src32_l, src43_l, src54_l, src65_l);
 
 2679         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 2680         SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
 
 2681         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2682         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 
 2683         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
 
 2684                     out3_r, tmp0, tmp1, tmp2, tmp3);
 
 2686         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
 
 2687         dst += (4 * dst_stride);
 
 2703     uint64_t out0, out1;
 
 2704     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 2705     v16i8 src11, filt0, filt1;
 
 2706     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
 
 2707     v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
 
 2709     v8i16 
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
 
 2714     filt = 
LD_SH(filter);
 
 2717     rnd_vec = __msa_fill_h(rnd_val);
 
 2720     LD_SB3(src, src_stride, src0, src1, src2);
 
 2722     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2723     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 2726     LD_SB3(src + 16, src_stride, src6, src7, src8);
 
 2727     src += (3 * src_stride);
 
 2729     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 2731     for (loop_cnt = (height >> 2); loop_cnt--;) {
 
 2733         LD_SB2(src, src_stride, src3, src4);
 
 2735         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 2736         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 2739         LD_SB2(src + 16, src_stride, src9, src10);
 
 2740         src += (2 * src_stride);
 
 2742         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 2755         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 2757         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2761         PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
 
 2763         out0 = __msa_copy_u_d((v2i64) out2_r, 0);
 
 2764         out1 = __msa_copy_u_d((v2i64) out3_r, 0);
 
 2773         LD_SB2(src, src_stride, src5, src2);
 
 2775         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
 
 2776         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
 
 2779         LD_SB2(src + 16, src_stride, src11, src8);
 
 2780         src += (2 * src_stride);
 
 2782         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
 
 2795         SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
 
 2797         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
 
 2817     uint32_t loop_cnt, cnt;
 
 2819     v16i8 
src0, 
src1, src2, src3, src4, src6, src7, src8, src9, src10;
 
 2820     v16i8 src10_r, src32_r, src76_r, src98_r;
 
 2821     v16i8 src21_r, src43_r, src87_r, src109_r;
 
 2822     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
 2823     v16i8 src10_l, src32_l, src76_l, src98_l;
 
 2824     v16i8 src21_l, src43_l, src87_l, src109_l;
 
 2831     rnd_vec = __msa_fill_h(rnd_val);
 
 2833     filt = 
LD_SH(filter);
 
 2836     for (cnt = (width >> 5); cnt--;) {
 
 2841         LD_SB3(src_tmp, src_stride, src0, src1, src2);
 
 2844         ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
 
 2845         ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
 
 2848         LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
 
 2849         src_tmp += (3 * src_stride);
 
 2852         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
 
 2853         ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
 
 2855         for (loop_cnt = (height >> 1); loop_cnt--;) {
 
 2857             LD_SB2(src_tmp, src_stride, src3, src4);
 
 2859             ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
 
 2860             ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
 
 2869             SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
 
 2870             SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
 
 2872             ST_UB(out, dst_tmp);
 
 2874             ST_UB(out, dst_tmp + dst_stride);
 
 2883             LD_SB2(src_tmp + 16, src_stride, src9, src10);
 
 2884             src_tmp += (2 * src_stride);
 
 2886             ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
 
 2887             ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
 
 2896             SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
 
 2897             SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
 
 2899             ST_UB(out, dst_tmp + 16);
 
 2901             ST_UB(out, dst_tmp + 16 + dst_stride);
 
 2903             dst_tmp += 2 * dst_stride;
 
 2923                               filter, height, rnd_val, 32);
 
 2930                                    const int8_t *filter_x,
 
 2931                                    const int8_t *filter_y,
 
 2934     v16i8 
src0, 
src1, src2, src3, src4;
 
 2936     v4i32 filt_h0, filt_h1;
 
 2937     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 2939     v8i16 filter_vec, const_vec;
 
 2940     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 2941     v8i16 dst0, dst1, dst2, dst3, dst4;
 
 2942     v4i32 dst0_r, dst1_r;
 
 2943     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 2945     src -= (src_stride + 1);
 
 2947     filter_vec = 
LD_SH(filter_x);
 
 2950     filter_vec = 
LD_SH(filter_y);
 
 2951     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 2952     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 2958     const_vec = __msa_ldi_h(128);
 
 2961     LD_SB3(src, src_stride, src0, src1, src2);
 
 2962     src += (3 * src_stride);
 
 2966     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 2967     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 2968     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 2977     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 2978     LD_SB2(src, src_stride, src3, src4);
 
 2982     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 2986     dst32_r = __msa_ilvr_h(dst3, dst2);
 
 2991     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 2995     dst43_r = __msa_ilvr_h(dst4, dst3);
 
 2999     dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
 
 3000     dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
 
 3002     dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
 
 3011                                    const int8_t *filter_x,
 
 3012                                    const int8_t *filter_y,
 
 3015     v16i8 
src0, 
src1, src2, src3, src4, src5, src6;
 
 3017     v4i32 filt_h0, filt_h1;
 
 3018     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3020     v8i16 filter_vec, const_vec;
 
 3021     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3022     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 3023     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
 
 3024     v8i16 out0_r, out1_r;
 
 3025     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 3027     src -= (src_stride + 1);
 
 3029     filter_vec = 
LD_SH(filter_x);
 
 3032     filter_vec = 
LD_SH(filter_y);
 
 3033     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3034     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3040     const_vec = __msa_ldi_h(128);
 
 3043     LD_SB3(src, src_stride, src0, src1, src2);
 
 3044     src += (3 * src_stride);
 
 3048     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3049     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3050     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3059     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 3060     LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 3064     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3068     dst32_r = __msa_ilvr_h(dst3, dst2);
 
 3073     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3077     dst43_r = __msa_ilvr_h(dst4, dst3);
 
 3082     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3086     dst10_r = __msa_ilvr_h(dst5, dst4);
 
 3091     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3095     dst21_r = __msa_ilvr_h(dst2, dst5);
 
 3099     PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
 
 3102     out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
 
 3104     ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
 
 3111                                            const int8_t *filter_x,
 
 3112                                            const int8_t *filter_y,
 
 3116     v16i8 
src0, 
src1, src2, src3, src4, src5;
 
 3117     v16i8 src6, src7, src8, src9, src10;
 
 3119     v4i32 filt_h0, filt_h1;
 
 3120     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3122     v8i16 filter_vec, const_vec;
 
 3123     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3124     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
 
 3125     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
 
 3126     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
 
 3127     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
 
 3128     v8i16 out0_r, out1_r, out2_r, out3_r;
 
 3130     src -= (src_stride + 1);
 
 3132     filter_vec = 
LD_SH(filter_x);
 
 3135     filter_vec = 
LD_SH(filter_y);
 
 3136     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3137     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3143     const_vec = __msa_ldi_h(128);
 
 3146     LD_SB3(src, src_stride, src0, src1, src2);
 
 3147     src += (3 * src_stride);
 
 3151     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3152     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3153     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3162     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
 
 3164     for (loop_cnt = height >> 3; loop_cnt--;) {
 
 3166                src3, src4, src5, src6, src7, src8, src9, src10);
 
 3167         src += (8 * src_stride);
 
 3172         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3176         dst32_r = __msa_ilvr_h(dst3, dst2);
 
 3181         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3185         dst43_r = __msa_ilvr_h(dst4, dst3);
 
 3190         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3194         dst54_r = __msa_ilvr_h(dst5, dst4);
 
 3199         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3203         dst65_r = __msa_ilvr_h(dst6, dst5);
 
 3208         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 3212         dst76_r = __msa_ilvr_h(dst7, dst6);
 
 3217         VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
 
 3221         dst87_r = __msa_ilvr_h(dst8, dst7);
 
 3226         VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
 
 3230         dst10_r = __msa_ilvr_h(dst9, dst8);
 
 3235         VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
 
 3239         dst21_r = __msa_ilvr_h(dst2, dst9);
 
 3244                     dst5_r, dst4_r, dst7_r, dst6_r,
 
 3245                     out0_r, out1_r, out2_r, out3_r);
 
 3250         PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
 
 3251         ST4x8_UB(out0_r, out1_r, dst, dst_stride);
 
 3252         dst += (8 * dst_stride);
 
 3260                                   const int8_t *filter_x,
 
 3261                                   const int8_t *filter_y,
 
 3266                                filter_x, filter_y, height);
 
 3267     } 
else if (4 == height) {
 
 3269                                filter_x, filter_y, height);
 
 3270     } 
else if (0 == (height % 8)) {
 
 3272                                        filter_x, filter_y, height);
 
 3280                                   const int8_t *filter_x,
 
 3281                                   const int8_t *filter_y,
 
 3285     v16i8 
src0, 
src1, src2, src3, src4, src5, src6;
 
 3287     v4i32 filt_h0, filt_h1;
 
 3288     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3290     v8i16 filter_vec, const_vec;
 
 3291     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3292     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 3293     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 3294     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 3295     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 3296     v8i16 out0_r, out1_r, out2_r, out3_r;
 
 3298     src -= (src_stride + 1);
 
 3300     filter_vec = 
LD_SH(filter_x);
 
 3303     filter_vec = 
LD_SH(filter_y);
 
 3304     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3305     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3311     const_vec = __msa_ldi_h(128);
 
 3314     LD_SB3(src, src_stride, src0, src1, src2);
 
 3315     src += (3 * src_stride);
 
 3319     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3320     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3321     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3333     for (loop_cnt = height >> 2; loop_cnt--;) {
 
 3334         LD_SB4(src, src_stride, src3, src4, src5, src6);
 
 3335         src += (4 * src_stride);
 
 3340         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3351         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3362         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3374         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3386                     dst2_l, dst2_r, dst3_l, dst3_r,
 
 3387                     out0_r, out1_r, out2_r, out3_r);
 
 3392         PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
 
 3393         ST6x4_UB(out0_r, out1_r, dst, dst_stride);
 
 3394         dst += (4 * dst_stride);
 
 3402                                    const int8_t *filter_x,
 
 3403                                    const int8_t *filter_y,
 
 3406     v16i8 
src0, 
src1, src2, src3, src4;
 
 3408     v4i32 filt_h0, filt_h1;
 
 3409     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3411     v8i16 filter_vec, const_vec;
 
 3412     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3413     v8i16 dst0, dst1, dst2, dst3, dst4;
 
 3414     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 3415     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 3416     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 3417     v8i16 out0_r, out1_r;
 
 3419     src -= (src_stride + 1);
 
 3421     filter_vec = 
LD_SH(filter_x);
 
 3424     filter_vec = 
LD_SH(filter_y);
 
 3425     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3426     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3432     const_vec = __msa_ldi_h(128);
 
 3435     LD_SB3(src, src_stride, src0, src1, src2);
 
 3436     src += (3 * src_stride);
 
 3440     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3441     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3442     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3454     LD_SB2(src, src_stride, src3, src4);
 
 3458     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3469     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3479     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
 
 3482     out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
 
 3491                                    const int8_t *filter_x,
 
 3492                                    const int8_t *filter_y,
 
 3495     v16i8 
src0, 
src1, src2, src3, src4, src5, src6, src7, src8;
 
 3497     v4i32 filt_h0, filt_h1;
 
 3498     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3500     v8i16 filter_vec, const_vec;
 
 3501     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3502     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 3503     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 3504     v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
 
 3505     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
 
 3506     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
 
 3507     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
 
 3508     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
 
 3509     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
 
 3511     src -= (src_stride + 1);
 
 3513     filter_vec = 
LD_SH(filter_x);
 
 3516     filter_vec = 
LD_SH(filter_y);
 
 3517     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3518     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3524     const_vec = __msa_ldi_h(128);
 
 3527     LD_SB3(src, src_stride, src0, src1, src2);
 
 3528     src += (3 * src_stride);
 
 3532     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3533     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3534     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3546     LD_SB2(src, src_stride, src3, src4);
 
 3547     src += (2 * src_stride);
 
 3552     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3564     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3574     LD_SB2(src, src_stride, src5, src6);
 
 3575     src += (2 * src_stride);
 
 3580     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3591     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3601     LD_SB2(src, src_stride, src7, src8);
 
 3602     src += (2 * src_stride);
 
 3607     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
 
 3619     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
 
 3630                 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
 
 3631     PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
 
 3637     PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
 
 3638     out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
 
 3640     ST8x4_UB(out0_r, out1_r, dst, dst_stride);
 
 3641     dst += (4 * dst_stride);
 
 3649                                        const int8_t *filter_x,
 
 3650                                        const int8_t *filter_y,
 
 3654     uint32_t loop_cnt, cnt;
 
 3657     v16i8 
src0, 
src1, src2, src3, src4, src5, src6;
 
 3659     v4i32 filt_h0, filt_h1;
 
 3660     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
 3662     v8i16 filter_vec, const_vec;
 
 3663     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
 
 3664     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
 
 3665     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 3666     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
 
 3667     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
 
 3668     v8i16 out0_r, out1_r, out2_r, out3_r;
 
 3670     src -= (src_stride + 1);
 
 3672     filter_vec = 
LD_SH(filter_x);
 
 3675     filter_vec = 
LD_SH(filter_y);
 
 3676     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
 
 3677     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
 
 3683     const_vec = __msa_ldi_h(128);
 
 3686     for (cnt = width >> 3; cnt--;) {
 
 3690         LD_SB3(src_tmp, src_stride, src0, src1, src2);
 
 3691         src_tmp += (3 * src_stride);
 
 3695         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 3696         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 3697         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 3709         for (loop_cnt = height >> 2; loop_cnt--;) {
 
 3710             LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
 
 3711             src_tmp += (4 * src_stride);
 
 3716             VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 3728             VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
 
 3739             VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
 
 3751             VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
 
 3763                         dst2_l, dst2_r, dst3_l, dst3_r,
 
 3764                         out0_r, out1_r, out2_r, out3_r);
 
 3769             PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
 
 3770             ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
 
 3771             dst_tmp += (4 * dst_stride);
 
 3783                                   const int8_t *filter_x,
 
 3784                                   const int8_t *filter_y,
 
 3789                                filter_x, filter_y, height);
 
 3790     } 
else if (6 == height) {
 
 3792                                filter_x, filter_y, height);
 
 3793     } 
else if (0 == (height % 4)) {
 
 3795                                    filter_x, filter_y, height, 8);
 
 3803                                    const int8_t *filter_x,
 
 3804                                    const int8_t *filter_y,
 
 3808                                filter_x, filter_y, height, 8);
 
 3811                           filter_x, filter_y, height);
 
 3818                                    const int8_t *filter_x,
 
 3819                                    const int8_t *filter_y,
 
 3823                                filter_x, filter_y, height, 16);
 
 3830                                    const int8_t *filter_x,
 
 3831                                    const int8_t *filter_y,
 
 3835                                filter_x, filter_y, height, 24);
 
 3842                                    const int8_t *filter_x,
 
 3843                                    const int8_t *filter_y,
 
 3847                                filter_x, filter_y, height, 32);
 
 3850 #define UNI_MC_COPY(WIDTH)                                                 \ 
 3851 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \ 
 3852                                                     ptrdiff_t dst_stride,  \ 
 3854                                                     ptrdiff_t src_stride,  \ 
 3860     copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \ 
 3873 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \ 
 3874 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \ 
 3885     const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \ 
 3887     common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \ 
 3888                                             filter, height, 6);                \ 
 3891 UNI_MC(qpel, 
h, 4, 8, hz, mx);
 
 3892 UNI_MC(qpel, 
h, 8, 8, hz, mx);
 
 3893 UNI_MC(qpel, 
h, 12, 8, hz, mx);
 
 3894 UNI_MC(qpel, 
h, 16, 8, hz, mx);
 
 3895 UNI_MC(qpel, 
h, 24, 8, hz, mx);
 
 3896 UNI_MC(qpel, 
h, 32, 8, hz, mx);
 
 3897 UNI_MC(qpel, 
h, 48, 8, hz, mx);
 
 3898 UNI_MC(qpel, 
h, 64, 8, hz, mx);
 
 3900 UNI_MC(qpel, v, 4, 8, vt, my);
 
 3901 UNI_MC(qpel, v, 8, 8, vt, my);
 
 3902 UNI_MC(qpel, v, 12, 8, vt, my);
 
 3903 UNI_MC(qpel, v, 16, 8, vt, my);
 
 3904 UNI_MC(qpel, v, 24, 8, vt, my);
 
 3905 UNI_MC(qpel, v, 32, 8, vt, my);
 
 3906 UNI_MC(qpel, v, 48, 8, vt, my);
 
 3907 UNI_MC(qpel, v, 64, 8, vt, my);
 
 3909 UNI_MC(epel, 
h, 4, 4, hz, mx);
 
 3910 UNI_MC(epel, 
h, 6, 4, hz, mx);
 
 3911 UNI_MC(epel, 
h, 8, 4, hz, mx);
 
 3912 UNI_MC(epel, 
h, 12, 4, hz, mx);
 
 3913 UNI_MC(epel, 
h, 16, 4, hz, mx);
 
 3914 UNI_MC(epel, 
h, 24, 4, hz, mx);
 
 3915 UNI_MC(epel, 
h, 32, 4, hz, mx);
 
 3917 UNI_MC(epel, v, 4, 4, vt, my);
 
 3918 UNI_MC(epel, v, 6, 4, vt, my);
 
 3919 UNI_MC(epel, v, 8, 4, vt, my);
 
 3920 UNI_MC(epel, v, 12, 4, vt, my);
 
 3921 UNI_MC(epel, v, 16, 4, vt, my);
 
 3922 UNI_MC(epel, v, 24, 4, vt, my);
 
 3923 UNI_MC(epel, v, 32, 4, vt, my);
 
 3927 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                           \ 
 3928 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \ 
 3939     const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \ 
 3940     const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \ 
 3942     hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,     \ 
 3943                                               dst_stride, filter_x,     \ 
 3944                                               filter_y, height);        \ 
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val, int32_t width)
#define HEVC_PCK_SW_SB2(in0, in1, out)
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
static void hevc_hv_uni_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static const uint8_t mc_filt_mask_arr[16 *3]
static void hevc_hv_uni_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void copy_width24_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define XORI_B3_128_SB(...)
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_width48_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define XORI_B2_128_SH(...)
#define XORI_B4_128_UB(...)
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH_0_255(in)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
#define SPLATI_H4_SH(...)
static void hevc_hv_uni_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
#define CLIP_SW_0_255(in)
static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH2_0_255(in0, in1)
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST8x2_UB(in, pdst, stride)
static const uint16_t mask[17]
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define SPLATI_H2_SB(...)
static void hevc_hv_uni_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B7_128_SB(...)
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define XORI_B4_128_SB(...)
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define DPADD_SB2_SH(...)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define SPLATI_W4_SW(...)
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void copy_width12_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val, int32_t width)
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define SPLATI_H4_SB(...)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width64_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_uni_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void copy_width16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
static const int8_t filt[NUMTAPS]
#define ST4x8_UB(in0, in1, pdst, stride)
#define ST6x4_UB(in0, in1, pdst, stride)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
#define ST8x4_UB(in0, in1, pdst, stride)
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define UNI_MC_COPY(WIDTH)
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,filt0, filt1, filt2, filt3)
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, uint8_t rnd_val)
static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define SPLATI_W2_SW(...)
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)
#define ST8x1_UB(in, pdst)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
#define ST4x2_UB(in, pdst, stride)
static void copy_width32_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, uint8_t rnd_val)