    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
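
/* Weight one (CLIP2) or two (CLIP4) pairs of v8i16 vectors: widen to 32
 * bits, multiply by the weight, round by rnd_w, pack back to 16 bits,
 * add the offset and clip the result to the [0, 255] pixel range. */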
 
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
                                                                              \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
}
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
                                       offset_h, rnd_w, out0_h, out1_h,    \
                                       out2_h, out3_h)                     \
{                                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
                                   out0_h, out1_h);                        \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
                                   out2_h, out3_h);                        \
}
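
/* Weighted copy, 4-pixel-wide columns (ST4xN stores): pixels are widened
 * to 16 bits and shifted left by 6 (HEVC intermediate precision) before
 * the weight/round/offset/clip step above. */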
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);

        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);

        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);

    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);

        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;

            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;

            SLLI_4V(dst0, dst1, dst2, dst3, 6);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            ST4x8_UB(out0, out1, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}
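
/* Weighted copy, 6-pixel-wide columns: eight rows per iteration, stored
 * four at a time with ST6x4_UB. */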
 
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);

        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
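
/* Weighted copy, 8-pixel-wide columns, with dedicated paths for heights
 * 2, 4 and 6 and a loop for multiples of 8. */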
 
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);

        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);

    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;

        LD2(src, src_stride, tp0, tp1);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;

            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            ST8x4_UB(out0, out1, dst, dst_stride);
            dst += (4 * dst_stride);
            ST8x4_UB(out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
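
/* Weighted copy, 12-pixel-wide columns: the left 8 and right 4 pixels of
 * four rows are weighted together and stored with ST12x4_UB. */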
 
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v16u8 out0, out1, out2;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
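
/* Weighted copy, 16-pixel-wide columns: one full vector per row, four
 * rows per iteration. */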
 
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
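
/* Weighted copy, 24-pixel-wide columns: a 16-byte vector plus an 8-byte
 * remainder per row (ST_UB4 plus ST8x4_UB at dst + 16). */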
 
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);

        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);

        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
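
/* Weighted copy, 32-pixel-wide columns: two vectors per row, two rows
 * per iteration. */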
 
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
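
/* Weighted copy, 48-pixel-wide columns: three vectors per row. */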
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);

        LD_SB3(src, 16, src3, src4, src5);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);

        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
    }
}
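
/* Weighted copy, 64-pixel-wide columns: four vectors per row. */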
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);

        LD_SB4(src, 16, src4, src5, src6, src7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13, dst14,
                                       dst15);

        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);

        ST_UB4(out4, out5, out6, out7, dst, 16);
    }
}
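
/* Horizontal 8-tap weighted filter, 4-wide: the VSHF_B4_SB shuffles use
 * two source rows per vector (presumably via the 4-width row of
 * ff_hevc_mask_arr), so eight rows are produced per iteration. */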
 
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
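
/* Horizontal 8-tap weighted filter, 8-wide: one row per vector, four
 * rows per iteration. */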
 
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
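
/* Horizontal 8-tap weighted filter, 12-wide: 8-wide shuffles (mask0..3)
 * for the left part, 4-wide shuffles (mask4..7) for the right 4 pixels. */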
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
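
/* Horizontal 8-tap weighted filter, 16-wide: each row is filtered as
 * two 8-pixel halves (loads at src and src + 8). */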
 
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
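
/* Horizontal 8-tap weighted filter, 24-wide; the fixed loop count of 16
 * with two rows per iteration implies a height of 32. */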
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);

        LD_SB2(src, 16, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST8x2_UB(out2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
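
/* Horizontal 8-tap weighted filter, 32-wide: four overlapping loads at
 * 8-byte steps per row supply the taps that cross segment boundaries. */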
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);

        LD_SB4(src, 8, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        ST_UB2(out0, out1, dst, 16);

        ST_UB2(out2, out3, dst, 16);
    }
}
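
/* Horizontal 8-tap weighted filter, 48-wide; the fixed loop count of 64
 * suggests one row per iteration at a height of 64. */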
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
    }
}
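
/* Horizontal 8-tap weighted filter, 64-wide: each row is processed as
 * two 32-pixel halves by the inner cnt loop. */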
 
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            ST_UB2(out0, out1, dst_tmp, 16);
        }
    }
}
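
/* Vertical 8-tap weighted filter, 4-wide: rows are byte-interleaved
 * pairwise (ILVR_B*) and packed two columns per vector; the last three
 * packed vectors slide forward between iterations. */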
 
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                  filt1, filt2, filt3);
        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                  filt1, filt2, filt3);
        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
                                  filt0, filt1, filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
                                  filt0, filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
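
/* Vertical 8-tap weighted filter, 8-wide: a seven-row history plus four
 * new rows per iteration. */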
 
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
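
/* Vertical 8-tap weighted filter, 12-wide: right interleaves cover the
 * left 8 columns, left interleaves (packed with ILVR_D*) the other 4. */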
 
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                 filt1, filt2, filt3);
        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
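
/* Vertical 8-tap weighted filter for multiples of 16 columns;
 * weightmul16 is the number of 16-column stripes to process. */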
 
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src98_r, src109_r, src98_l, src109_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = weightmul16; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_r, src32_r, src54_r, src21_r);
            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_l, src32_l, src54_l, src21_l);
            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
            dst4 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
            dst5 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
            dst6 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                     filt0, filt1, filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);

            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }
    }
}
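
/* Call tails of the width-specific vertical wrappers: they forward to
 * the shared 16-column kernel above with stripe counts 1, 2, 3 and 4
 * (16/32/48/64 wide); the 24-wide case handles its 8-pixel remainder
 * with a separate 8-wide call. */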
 
                                       filter, height, weight,
                                       offset, rnd_val, 1);

                                       offset, rnd_val, 1);

                             filter, 32, weight, offset, rnd_val);

                                       filter, height, weight,
                                       offset, rnd_val, 2);

                                       offset, rnd_val, 3);

                                       filter, height, weight,
                                       offset, rnd_val, 4);
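
/* 2-D (horizontal then vertical) 8-tap weighted filter, 4-wide. The
 * 128 * weight term folded into offset_vec apparently compensates the
 * -128 bias of the XOR-by-128 applied to the source bytes (visible in
 * the 12-wide variant below). */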
 
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,

    uint32_t loop_cnt;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        dst76_r = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);

        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
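
/* 2-D 8-tap weighted filter for multiples of 8 columns; width gives the
 * number of 8-column stripes. */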
 
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,

    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp, *dst_tmp;

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;

    src -= ((3 * src_stride) + 3);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
            MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
            SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
            ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
            ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);

            PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
        }
    }
}
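
/* 8-wide wrapper: forwards to the shared 8-column 2-D kernel with
 * width 8. */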
 
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 8);
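
/* 2-D 8-tap weighted filter, 12-wide: an 8-column pass using mask0..3
 * followed by a 4-column pass using mask4..7. */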
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
    v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst76_l, filter_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    denom_vec = rnd_vec - 6;

    const_128 = __msa_ldi_w(128);
    const_128 *= weight_vec;
    offset_vec += __msa_srar_w(const_128, denom_vec);

    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src_tmp += src_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);

        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);

        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);

        dst_tmp += dst_stride;
    }

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);

        dst76_r = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);

        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
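
/* Remaining 2-D wrappers: forward to the 8-column kernel with width 16,
 * 24, 32, 48 or 64. */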
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 16);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 24);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 32);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 48);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 64);
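/* The five wrappers above differ only in the trailing width argument
 * (16, 24, 32, 48, 64): each forwards src/dst, the two filters, height
 * and the weighting parameters to the shared 8-column HV kernel. */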
 
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB2(src, src_stride, src0, src1);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);

    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);

    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    dst0 = __msa_adds_s_h(dst0, offset_vec);

    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);

    dst += (4 * dst_stride);
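/*
 * Setup pattern shared by the uni-weight kernels in this file: per
 * output sample the intent is roughly
 *
 *     out = clip_u8(((filtered * weight) >> rnd_val) + offset)
 *
 * with the rounding folded into rnd_vec (SRAR) and a weight-scaled bias
 * pre-added into offset_vec during setup, so the inner loop is just a
 * DOTP/MUL, a shift and an add before packing.  This is a sketch of the
 * intent, not a line-for-line scalar equivalent.
 */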
 
    v16i8 mask1, vec0, vec1, vec2, vec3;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB4(src, src_stride, src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
 
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3;

    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
 
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {

                                  filter, weight, offset, rnd_val);
    } else if (8 == height || 16 == height) {

                                          filter, height, weight,
 
    v16u8 out0, out1, out2, out3;

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5, dst6, dst7);

    ST6x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST6x4_UB(out2, out3, dst, dst_stride);
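    /* For width 6 the stores are the interesting part: ST6x4_UB emits six
     * bytes per row (a word plus a halfword), so neighbouring destination
     * pixels are never clobbered. */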
 
    v8i16 filt0, filt1, dst0, dst1;

    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB2(src, src_stride, src0, src1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
    v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB4(src, src_stride, src0, src1, src2, src3);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

    ST8x4_UB(out0, out1, dst, dst_stride);
 
    v16u8 out0, out1, out2;

    v16i8 src0, src1, src2, src3, src4, src5;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
 
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {

                                  filter, weight, offset, rnd_val);
    } else if (6 == height) {

                                  filter, weight, offset, rnd_val);

                                          filter, height, weight, offset,
 
    v16u8 out0, out1, out2;

    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;

    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
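    /* Width 12 is split as 8 + 4: mask0/mask1 filter the left 8 columns of
     * each row on its own, while mask2 (note the 24.. indices, which select
     * from the second VSHF source register) pairs two rows so their
     * right-hand 4-column strips share one vector. */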
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    out0, out1, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
 
    v16u8 out0, out1, out2;

    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST8x2_UB(out2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
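    /* Width 24 is handled as 16 + 8: two full-vector rows go out with
     * ST_UB2 and the remaining 8 columns with ST8x2_UB at dst + 16. */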
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5;

    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        LD_SB2(src, 16, src3, src4);
        src5 = LD_SB(src + 24);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst4, dst5, dst6, dst7);

        ST_UB2(out0, out1, dst, 16);

        ST_UB2(out2, out3, dst, 16);
    }
 
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;

    v4i32 dst0_r, dst0_l;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);

    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);

    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    dst0 = __msa_adds_s_h(dst0, offset_vec);

    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
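    /* ILVR_D2 stacks the two interleaved row pairs into src2110/src4332,
     * so a single 4-tap dot-product pass produces both 4-wide output rows
     * at once. */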
 
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;

    v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src109_r, src98_r, src4332, src6554, src8776, src10998);

                                       weight_vec, offset_vec, rnd_vec,
                                       dst0, dst1, dst2, dst3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
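    /* Each pass consumes eight new rows; the elided tail of the loop
     * presumably rotates the bottom interleave (src10998 and src10) into
     * src2110/src2 so the 4-tap vertical history never re-reads source
     * lines. */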
 
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {

                                  filter, weight, offset, rnd_val);
    } else if (0 == (height % 8)) {

                                          filter, height, weight, offset,
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst0, dst1, dst2, dst3);

                                   weight_vec, offset_vec, rnd_vec,
                                   dst4, dst5, dst6, dst7);

    ST6x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST6x4_UB(out2, out3, dst, dst_stride);
 
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
 
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src5, src6, src54_r, src65_r;

    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

                                   offset_vec, rnd_vec, dst0, dst1, dst2,
                                   dst3);

    ST8x4_UB(out0, out1, dst, dst_stride);
 
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);

                                   offset_vec, rnd_vec, dst0, dst1, dst2, dst3);

    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
 
                                  filter, weight, offset, rnd_val);
    } else if (4 == height) {

                                  filter, weight, offset, rnd_val);
    } else if (6 == height) {

                                  filter, weight, offset, rnd_val);

                                      filter, height, weight, offset,
 
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (1 * src_stride);

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);

        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);

                                       offset_vec, rnd_vec, dst6, dst7, dst8,
                                       dst9);
                                       rnd_vec, dst10, dst11);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST12x4_UB(out3, out4, out5, dst, dst_stride);
        dst += (4 * dst_stride);
    }
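    /* The 12-wide vertical path runs the usual 8-column filter on the
     * right-interleaved (_r) vectors, while the left-interleaved (_l)
     * halves of two rows are packed together (src2110, src4332, ...) so
     * the extra 4 columns cost one filter call per pair of rows. */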
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src54_r, src54_l, src65_r, src65_l, src6;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
 
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src7, src8, src9);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
        src += (4 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);

                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
                    out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        ST8x4_UB(out4, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);

        src87_r = src1211_r;
        src98_r = src1312_r;
    }
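    /* The left 16 columns keep both _r and _l interleave histories; the
     * extra 8-column strip only needs the _r half, and its last two
     * interleaves are rotated into src87_r/src98_r for the next pass. */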
 
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;

    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    LD_SB3(src + 16, src_stride, src5, src6, src7);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        LD_SB2(src + 16, src_stride, src8, src9);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
                    out2, out3);
        ST_UB2(out0, out2, dst, 16);

        ST_UB2(out1, out3, dst, 16);
    }
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4;

    v8i16 filt_h0, filt_h1, filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);

    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);

    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
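    /* In the HV kernels the weighting bias is folded up front:
     * offset_vec becomes offset + ((128 * weight) >> (rnd_val - 6)),
     * which presumably compensates the signed-byte conversion of the
     * sources so only one add is needed after the vertical pass. */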
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6;

    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);
    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
    MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);

    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;

    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);

        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
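    /* As in the 8-tap HV case above, the splat of dst106 keeps the newest
     * horizontal result as the next iteration's vertical history (dst22). */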
 
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,

                                  filter_x, filter_y, weight,
    } else if (4 == height) {

                                  filter_x, filter_y, weight,
    } else if (0 == (height % 8)) {

                                          filter_x, filter_y, height, weight,
 
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,

    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;

    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
    SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
    SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);

    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST4x8_UB(out0, out1, dst, dst_stride);
    ST2x4_UB(out2, 0, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST2x4_UB(out2, 4, dst + 4, dst_stride);
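    /* Width 6 output: the left 4 columns go out via ST4x8_UB and the
     * remaining 2 columns via the two ST2x4_UB stores at dst + 4. */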
 
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4;

    v8i16 filt_h0, filt_h1, filter_vec;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    v8i16 offset_vec, const_128, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);

        ST8x4_UB(out0, out1, dst, dst_stride);
    }
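    /* One iteration of the width8mult loop covers an 8-column strip four
     * rows tall; callers presumably pass width / 8 so the same body can
     * serve the 8-, 16- and 24-wide variants. */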
 
 4743                                       const int8_t *filter_x,
 
 4744                                       const int8_t *filter_y,
 
 4749     v16u8 out0, out1, out2;
 
 4750     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
 
 4752     v8i16 filt_h0, filt_h1, filter_vec;
 
 4755     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
 
 4756     v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
 
 4757     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
 
 4758     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 4759     v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
 
 4760     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
 
 4761     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
 
 4762     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
 
 4763     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
 
 4764     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
 
 4765     v8i16 offset_vec, const_128, denom_vec;
 
 4767     src -= (src_stride + 1);
 
 4769     filter_vec = LD_SH(filter_x);
 
 4772     filter_vec = LD_SH(filter_y);
 
 4779     weight_vec = __msa_fill_w(weight);
 
 4780     rnd_vec = __msa_fill_w(rnd_val);
 
 4782     offset_vec = __msa_fill_h(offset);
 
 4783     denom_vec = __msa_fill_h(rnd_val - 6);
 
 4784     const_128 = __msa_fill_h((128 * weight));
 
 4785     offset_vec += __msa_srar_h(const_128, denom_vec);
 
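    /* A 4-tap vertical filter producing 6 output rows needs
     * 6 + 4 - 1 = 9 input rows (src0..src8). */
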
 4787     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
 4788     src += (5 * src_stride);
 
 4789     LD_SB4(src, src_stride, src5, src6, src7, src8);
 
 4792     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4793     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4794     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 4795     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
 
 4796     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
 
 4797     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
 
 4798     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
 
 4799     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
 
 4800     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
 
 4830     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 4831     SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 4832     SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
 
 4833     MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
 
 4834     MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
 
 4835     MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
 
 4836     MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
 
 4837     MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
 
 4838     MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
 
 4839     SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
 
 4840     SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
 
 4841     SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
 
 4842     PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
 
 4843                 tmp0, tmp1, tmp2, tmp3);
 
 4844     PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
 
 4845     ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
 
 4846     ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
 
 4847     ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
 
 4850     PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
 
 4851     ST8x4_UB(out0, out1, dst, dst_stride);
 
 4852     dst += (4 * dst_stride);
 
 4860                                               const int8_t *filter_x,
 
 4861                                               const int8_t *filter_y,
 
 4868     uint32_t loop_cnt, cnt;
 
 4872     v16i8 src0, src1, src2, src3, src4, src5, src6;
 
 4874     v8i16 filt_h0, filt_h1, filter_vec;
 
 4877     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 4878     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
 
 4879     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
 
 4880     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
 
 4881     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
 
 4882     v8i16 offset_vec, const_128, denom_vec;
 
 4883     v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
 
 4884     v4i32 weight_vec, rnd_vec;
 
 4886     src -= (src_stride + 1);
 
 4888     filter_vec = LD_SH(filter_x);
 
 4891     filter_vec = LD_SH(filter_y);
 
 4898     weight_vec = __msa_fill_w(weight);
 
 4899     rnd_vec = __msa_fill_w(rnd_val);
 
 4901     offset_vec = __msa_fill_h(offset);
 
 4902     denom_vec = __msa_fill_h(rnd_val - 6);
 
 4903     const_128 = __msa_fill_h((128 * weight));
 
 4904     offset_vec += __msa_srar_h(const_128, denom_vec);
 
 4906     for (cnt = width8mult; cnt--;) {
 
 4910         LD_SB3(src_tmp, src_stride, src0, src1, src2);
 
 4911         src_tmp += (3 * src_stride);
 
 4914         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 4915         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 4916         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
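    /* src0..src2 prepared above seed the vertical 4-tap context; each
     * iteration consumes 4 new rows and writes 4 output rows. */
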
 4924         for (loop_cnt = height >> 2; loop_cnt--;) {
 
 4925             LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
 
 4926             src_tmp += (4 * src_stride);
 
 4929             VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 4930             VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 4931             VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 4932             VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 4949             SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 4950             SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 4951             MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
 
 4952             MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
 
 4953             MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
 
 4954             MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
 
 4955             SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
 
 4956             SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
 
 4957             PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
 
 4958                         dst3_r, tmp0, tmp1, tmp2, tmp3);
 
 4959             ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
 
 4960             ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
 
 4963             ST8x4_UB(out0, out1, dst_tmp, dst_stride);
 
 4964             dst_tmp += (4 * dst_stride);
 
 4982                                      const int8_t *filter_x,
 
 4983                                      const int8_t *filter_y,
 
 4992                                   filter_x, filter_y, weight,
 
 4994     } else if (4 == height) {
 
 4996                                       filter_x, filter_y, 1, weight,
 
 4998     } else if (6 == height) {
 
 5000                                   filter_x, filter_y, weight,
 
 5002     } else if (0 == (height % 4)) {
 
 5004                                           filter_x, filter_y, height, weight,
 
 5005                                           offset, rnd_val, 1);
 
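/* The 8w dispatcher above picks a dedicated, fully unrolled kernel for
 * heights 2, 4 and 6; any other multiple of 4 goes through the generic
 * 8-column stripe loop. */
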
 5013                                       const int8_t *filter_x,
 
 5014                                       const int8_t *filter_y,
 
 5023     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 
 5024     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 5025     v16i8 mask0, mask1, mask2, mask3;
 
 5026     v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
 
 5027     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
 
 5028     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
 
 5029     v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
 
 5030     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
 
 5031     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
 
 5032     v8i16 offset_vec, const_128, denom_vec;
 
 5033     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
 
 5034     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
 
 5036     src -= (src_stride + 1);
 
 5038     filter_vec = LD_SH(filter_x);
 
 5041     filter_vec = LD_SH(filter_y);
 
 5049     weight_vec = __msa_fill_w(weight);
 
 5050     rnd_vec = __msa_fill_w(rnd_val);
 
 5052     offset_vec = __msa_fill_h(offset);
 
 5053     denom_vec = __msa_fill_h(rnd_val - 6);
 
 5054     const_128 = __msa_fill_h((128 * weight));
 
 5055     offset_vec += __msa_srar_h(const_128, denom_vec);
 
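    /* 12w: the left 8 columns are handled first, 4 iterations of 4 rows
     * each (this kernel appears to assume height 16); the remaining 4
     * columns follow below. */
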
 5060     LD_SB3(src_tmp, src_stride, src0, src1, src2);
 
 5061     src_tmp += (3 * src_stride);
 
 5063     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
 
 5064     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
 
 5065     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
 
 5072     for (loop_cnt = 4; loop_cnt--;) {
 
 5073         LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
 
 5074         src_tmp += (4 * src_stride);
 
 5076         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
 
 5077         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
 
 5078         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
 
 5079         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
 
 5096         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
 
 5097         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
 
 5098         MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
 
 5099         MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
 
 5100         MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
 
 5101         MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
 
 5102         SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
 
 5103         SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
 
 5104         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
 
 5105                     dst3_r, tmp0, tmp1, tmp2, tmp3);
 
 5106         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
 
 5107         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
 
 5110         ST8x4_UB(out0, out1, dst_tmp, dst_stride);
 
 5111         dst_tmp += (4 * dst_stride);
 
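    /* Second pass: the remaining 4 right-hand columns, 8 rows per
     * iteration. */
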
 5126     LD_SB3(src, src_stride, src0, src1, src2);
 
 5127     src += (3 * src_stride);
 
 5129     VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
 
 5130     VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
 
 5134     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
 
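    /* mask2/mask3 pack two 4-wide rows into one vector, so rows i and
     * i + 4 (src3/src7, src4/src8, ...) are shuffled and filtered in
     * pairs; hence the dst73/dst84/dst95/dst106 names. */
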
 5136     for (loop_cnt = 2; loop_cnt--;) {
 
 5137         LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
 
 5139         src += (8 * src_stride);
 
 5141         VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
 
 5142         VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
 
 5143         VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
 
 5144         VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
 
 5149         dst32_r = __msa_ilvr_h(dst73, dst22);
 
 5153         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
 
 5154         dst76_r = __msa_ilvr_h(dst22, dst106);
 
 5163         SRA_4V(dst0, dst1, dst2, dst3, 6);
 
 5164         SRA_4V(dst4, dst5, dst6, dst7, 6);
 
 5165         MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
 
 5166         MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
 
 5167         MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
 
 5168         MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
 
 5171         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
 
 5173         ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
 
 5174         ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
 
 5177         ST4x8_UB(out0, out1, dst, dst_stride);
 
 5178         dst += (8 * dst_stride);
 
 5182         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
 
 5190                                       const int8_t *filter_x,
 
 5191                                       const int8_t *filter_y,
 
 5199                                       filter_x, filter_y, 2, weight, offset,
 
 5203                                           filter_x, filter_y, height, weight,
 
 5204                                           offset, rnd_val, 2);
 
 5212                                       const int8_t *filter_x,
 
 5213                                       const int8_t *filter_y,
 
 5220                                       filter_x, filter_y, height, weight,
 
 5221                                       offset, rnd_val, 3);
 
 5228                                       const int8_t *filter_x,
 
 5229                                       const int8_t *filter_y,
 
 5236                                       filter_x, filter_y, height, weight,
 
 5237                                       offset, rnd_val, 4);
 
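/* Widths 16, 24 and 32 reuse hevc_hv_uniwgt_4t_8multx4mult_msa with
 * width8mult = 2, 3 and 4 respectively. */
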
 5240 #define UNIWGT_MC_COPY(WIDTH)                                                \ 
 5241 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \ 
 5242                                                       ptrdiff_t dst_stride,  \ 
 5244                                                       ptrdiff_t src_stride,  \ 
 5253     int shift = denom + 14 - 8;                                              \ 
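    /* shift = log2Wd = denom + 14 - bitDepth (8 bit here), per HEVC WP */  \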
 5254     hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \ 
 5255                                     height, weight, offset, shift);          \ 
 5268 #undef UNIWGT_MC_COPY 
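
/*
 * For reference, a scalar model of the uni-weighted copy path that the
 * UNIWGT_MC_COPY kernels above vectorize (an illustrative sketch, not part
 * of the original file; the function name is hypothetical). The input is
 * widened to the 14-bit intermediate domain (<< 6, standing in for a filter
 * pass), multiplied by the weight, rounded down by rnd_val, offset and
 * clipped, matching the SLLI/DOTP/SRAR/ADD/CLIP sequence of the vector code.
 */
static void uniwgt_copy_scalar_ref(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   int width, int height,
                                   int weight, int offset, int rnd_val)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int val = (src[x] << 6) * weight;               /* widen, weight */
            val = ((val + (1 << (rnd_val - 1))) >> rnd_val) + offset;
            dst[x] = val < 0 ? 0 : (val > 255 ? 255 : val); /* clip to 8 bit */
        }
        src += src_stride;
        dst += dst_stride;
    }
}
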
 5270 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \ 
 5271 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \ 
 5285     const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \ 
 5286     int shift = denom + 14 - 8;                                               \ 
 5288     hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \ 
 5289                                                  dst_stride, filter, height,  \ 
 5290                                                  weight, offset, shift);      \ 
 5329 #define UNI_W_MC_HV(PEL, WIDTH, TAP)                                          \ 
 5330 void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,           \ 
 5331                                                       ptrdiff_t dst_stride,   \ 
 5333                                                       ptrdiff_t src_stride,   \ 
 5342     const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \ 
 5343     const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \ 
 5344     int shift = denom + 14 - 8;                                               \ 
 5346     hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \ 
 5347                                            filter_x, filter_y,  height,       \ 
 5348                                            weight, offset, shift);            \ 