/* 4-tap VP9 loop filter on one edge slice: pixels are biased into signed
 * range (xor 0x80), a clamped delta is built from p1 - q1 and
 * 3 * (q0 - p0), gated by the hev and filter masks, then applied with
 * rounding and the bias removed. */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
                                                                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = filt & (v16i8) mask_in;                                       \
                                                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    filt = __msa_srari_b(filt1, 1);                                      \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}
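
/* Flatness check for the 8-tap path: flat_out goes all-ones in lanes where
 * p2/p3 and q2/q3 each differ from p0/q0 by at most 1; the result is ANDed
 * with the 4-tap filter mask, picked up as "mask" from the caller's scope. */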
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
    v16u8 zero_in = { 0 };                                             \
                                                                       \
    tmp = __msa_ori_b(zero_in, 1);                                     \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
                                                                       \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
                                                                       \
    flat_out = (tmp < (v16u8) flat_out);                               \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    flat_out = flat_out & (mask);                                      \
}
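
/* Wider flatness check for the 16-tap path: the same <= 1 test applied to
 * p4..p7 and q4..q7, ANDed with the flat mask produced by VP9_FLAT4. */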
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 tmp, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
                                                                    \
    tmp = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
                                                                    \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
                                                                    \
    flat2_out = (tmp < (v16u8) flat2_out);                          \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & flat_in;                                \
}
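
/* 8-tap smoothing on 16-bit lanes; every output is a rounded 3-bit shift of
 * an 8-sample sum, e.g. p2_out = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3,
 * with partial sums (tmp0/tmp1/tmp2) reused between outputs. */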
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
                    q0_in, q1_in, q2_in, q3_in,                \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
{                                                              \
    v8u16 tmp0, tmp1, tmp2;                                    \
                                                               \
    tmp2 = p2_in + p1_in + p0_in;                              \
    tmp0 = p3_in << 1;                                         \
                                                               \
    tmp0 = tmp0 + tmp2 + q0_in;                                \
    tmp1 = tmp0 + p3_in + p2_in;                               \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 + p1_in + q1_in;                               \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = q2_in + q1_in + q0_in;                              \
    tmp2 = tmp2 + tmp1;                                        \
    tmp0 = tmp2 + (p0_in);                                     \
    tmp0 = tmp0 + (p3_in);                                     \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
                                                               \
    tmp0 = q2_in + q3_in;                                      \
    tmp0 = p0_in + tmp1 + tmp0;                                \
    tmp1 = q3_in + q3_in;                                      \
    tmp1 = tmp1 + tmp0;                                        \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp0 = tmp2 + q3_in;                                       \
    tmp1 = tmp0 + q0_in;                                       \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 - p2_in;                                       \
    tmp0 = q1_in + q3_in;                                      \
    tmp1 = tmp0 + tmp1;                                        \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
}
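
/* Derives the three control masks from pixel differences: hev (high edge
 * variance: |p1 - p0| or |q1 - q0| above thresh), the 4-tap filter mask
 * (edge passes the inner/outer limits) and the raw |p1 - p0| / |q1 - q0|
 * maximum that VP9_FLAT4 reuses as its flat seed. */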
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute subtraction of pixel values */                     \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                   \
    /* calculation of hev */                                       \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = thresh_in < (v16u8) flat_out;                        \
                                                                   \
    /* calculation of mask */                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                   \
    mask_out = b_limit_in < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                   \
    mask_out = limit_in < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}
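
/* Horizontal-edge ("v") entry points follow. The prototypes are written to
 * match the usual MSA loop-filter layout (packed 8-bit limit values passed
 * in int32 arguments); "4_8" means the 4-tap filter on an 8-column edge. */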
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
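
/* Two independent 8-column edges at once: the second edge's thresh/limit
 * values sit in bits 8..15 of each parameter and are spread into the upper
 * half of the comparison vectors. */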
 
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
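
/* 8-column edge with the conditional 8-tap path: the 4-tap result is kept
 * where "flat" is zero, the smoother VP9_FILTER8 output elsewhere. */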
 
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8,
                    q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

        src -= 3 * pitch;

        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
        src += (4 * pitch);
        SD(q1_d, src);
        src += pitch;
        SD(q2_d, src);
    }
}
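
/* 16-column filter8 edge: both 8-byte halves are unpacked to 16 bits (right
 * and left) and run through VP9_FILTER8 separately. */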
 
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
    }
}
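
/* Mixed 8/4 edge: filter8 is valid only for the first 8 columns, so "flat"
 * is cleared in its upper half and only the right unpack is computed. */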
 
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* filter8 only applies to the first 8 columns */
    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
    }
}
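
/* The mirrored 4/8 case: filter8 applies to the second 8 columns only, so
 * element 0 of "flat" is cleared and only the left unpack is computed. */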
 
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* filter8 only applies to the second 8 columns */
    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
    }
}
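
/* Stage one of the 16-column filter16 edge: runs filter4/filter8 and parks
 * the six filter8 rows plus the flat mask in the 16-byte-strided "filter48"
 * scratch area; returns 1 when the 16-tap stage can be skipped entirely. */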
 
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
                                        uint8_t *filter48,
                                        int32_t b_limit_ptr,
                                        int32_t limit_ptr,
                                        int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

        return 1;
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
        ST_UB(flat, filter48);

        return 0;
    }
}
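
/* Stage two: 15-tap smoothing across p6..q6. A running 16-bit sum per half
 * is updated row by row (add one incoming tap, drop one outgoing tap) and
 * each output is (sum + 8) >> 4, blended with stage one's rows via flat2. */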
 
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch,
                               uint8_t *filter48)
{
    v16u8 flat, flat2, filter8;
    v16i8 zero = { 0 };
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
    v8i16 l_out, r_out;

    flat = LD_UB(filter48 + 6 * 16);

    /* load vector elements */
    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

    /* if flat2 is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat2)) {
        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        src -= 3 * pitch;
        ST_UB4(p2, p1, p0, q0, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1, q2, src, pitch);
    } else {
        src -= 7 * pitch;

        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);

        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);

        /* running sum for the right (low) half; first output row is p6 */
        tmp0_r = p7_r_in << 3;
        tmp0_r -= p7_r_in;
        tmp0_r += p6_r_in;
        tmp0_r += q0_r_in;
        tmp1_r = p6_r_in + p5_r_in;
        tmp1_r += p4_r_in;
        tmp1_r += p3_r_in;
        tmp1_r += p2_r_in;
        tmp1_r += p1_r_in;
        tmp1_r += p0_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
                   p5_l_in, p4_l_in);
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
                   p1_l_in, p0_l_in);
        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);

        tmp0_l = p7_l_in << 3;
        tmp0_l -= p7_l_in;
        tmp0_l += p6_l_in;
        tmp0_l += q0_l_in;
        tmp1_l = p6_l_in + p5_l_in;
        tmp1_l += p4_l_in;
        tmp1_l += p3_l_in;
        tmp1_l += p2_l_in;
        tmp1_l += p1_l_in;
        tmp1_l += p0_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
        ST_UB(p6, src);
        src += pitch;

        /* p5 */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
        tmp0_l = p5_l_in - p6_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
        ST_UB(p5, src);
        src += pitch;

        /* p4 */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
        tmp0_l = p4_l_in - p5_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
        ST_UB(p4, src);
        src += pitch;

        /* p3 */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
        tmp0_l = p3_l_in - p4_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
        ST_UB(p3, src);
        src += pitch;

        /* p2: blend against the stored filter8 row */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
        tmp0_l = p2_l_in - p3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p1 */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
        tmp0_l = p1_l_in - p2_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p0 */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
        tmp0_l = p0_l_in - p1_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q0 */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;
        tmp0_r += q0_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
        tmp0_l = q7_l_in - p0_l_in;
        tmp0_l += q0_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q1 */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p6_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q0_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p6_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q2 */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p5_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q1_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p5_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q3 */
        tmp0_r = q7_r_in - q2_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p4_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q2_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p4_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
        ST_UB(q3, src);
        src += pitch;

        /* q4 */
        tmp0_r = q7_r_in - q3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p3_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p3_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
        ST_UB(q4, src);
        src += pitch;

        /* q5 */
        tmp0_r = q7_r_in - q4_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p2_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q4_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p2_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
        ST_UB(q5, src);
        src += pitch;

        /* q6 */
        tmp0_r = q7_r_in - q5_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p1_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q5_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p1_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
        ST_UB(q6, src);
    }
}
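
/* Public 16-column filter16 entry: stage one may early-exit; otherwise the
 * 16-tap stage picks its filter8 rows back up from the scratch buffer.
 * ALLOC_ALIGNED/ALIGNMENT come from the generic MSA macros. */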
 
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t early_exit = 0;
    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);

    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {
        vp9_hz_lpf_t16_16w(src, pitch, &filter48[0]);
    }
}
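
/* filter16 on an 8-column edge: the same cascade on 64-bit rows, handling
 * two output rows per step and storing them as scalar double words. */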
 
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
                               int32_t b_limit_ptr,
                               int32_t limit_ptr,
                               int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
        /* filter8 */
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
                   q1_r, q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
                    q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8,
                    q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        /* load 16 vector elements */
        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

        /* if flat2 is zero for all pixels, then no need for the 16-tap stage */
        if (__msa_test_bz_v(flat2)) {
            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
            SD(q1_d, src + pitch);
            SD(q2_d, src + 2 * pitch);
        } else {
            /* filter16: 16-bit right unpacks of the outer taps */
            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
                       q4_r, q5_r, q6_r, q7_r);

            tmp0 = p7_r << 3;
            tmp0 -= p7_r;
            tmp0 += p6_r;
            tmp0 += q0_r;

            src -= 7 * pitch;

            /* calculation of p6 and p5 */
            tmp1 = p6_r + p5_r + p4_r + p3_r;
            tmp1 += (p2_r + p1_r + p0_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp0 = p5_r - p6_r + q1_r - p7_r;
            tmp1 += tmp0;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p4 and p3 */
            tmp0 = p4_r - p5_r + q2_r - p7_r;
            tmp2 = p3_r - p4_r + q3_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p2 and p1 */
            tmp0 = p2_r - p3_r + q4_r - p7_r;
            tmp2 = p1_r - p2_r + q5_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p0 and q0 */
            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q1 and q2 */
            tmp0 = q7_r - q0_r + q1_r - p6_r;
            tmp2 = q7_r - q1_r + q2_r - p5_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q3 and q4 */
            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q5 and q6 */
            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
        }
    }
}
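
/* Vertical-edge ("h") versions: rows are transposed into the p/q register
 * layout, filtered with the same macros, and written back as narrow column
 * stores (ST_W4/ST_H4 word and half-word stores). */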
 
void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, limit, thresh, b_limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v8i16 vec0, vec1, vec2, vec3;

    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
}
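
/* Two stacked 8-row vertical edges, with per-edge limits packed exactly as
 * in the v_44 case; a 16x8 transpose produces the p/q rows. */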
 
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat;
    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);

    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;

    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}
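
/* 8-row vertical edge with the filter8 path: 4 columns (p1..q1) are written
 * back when flat is zero, 6 columns (p2..q2) otherwise. */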
 
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4;

    /* load vector elements */
    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        /* store 4 columns, p1..q1 */
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* blend and store 6 columns, p2..q2 */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);

        src -= 3;
        ST_W4(vec2, 0, 1, 2, 3, src, pitch);
        ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
    }
}
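
/* 16-row vertical edge, filter8 active on both 8-row halves. */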
 
void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);

        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* blend and store 6 columns, p2..q2 */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
    }
}
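
/* 16-row vertical edge, filter8 restricted to the first 8 rows. */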
 
void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* filter8 only applies to the first 8 rows */
    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* blend and store 6 columns, p2..q2 */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
    }
}
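
/* 16-row vertical edge, filter8 restricted to the second 8 rows. */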
 
void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* filter8 only applies to the second 8 rows */
    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);

        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* blend and store 6 columns, p2..q2 */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
    }
}
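
/* Transpose helpers for the vertical filter16 edges: pixels are staged in a
 * column-major scratch buffer with a fixed pitch of 16 bytes. */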
 
static void transpose_16x8_to_8x16(uint8_t *input, ptrdiff_t in_pitch,
                                   uint8_t *output, ptrdiff_t out_pitch)
{
    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v16i8 zeros = { 0 };

    LD_UB8(input, in_pitch,
           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
    /* 8x8 transpose */
    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
    /* 8x8 transpose */
    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
    SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);

    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    output += (8 * out_pitch);
    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
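
/* Inverse helper: an 8x16 scratch region back to 16x8 pixels. */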
 
static void transpose_8x16_to_16x8(uint8_t *input, ptrdiff_t in_pitch,
                                   uint8_t *output, ptrdiff_t out_pitch)
{
    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}
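
/* Full 16x16 transpose as two 16x8 passes; the q half is rebuilt with
 * even/odd byte, half-word and word interleaves instead of a second
 * TRANSPOSE16x8_UB_UB. */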
 
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
    v4i32 tmp2, tmp3;
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    input += (8 * in_pitch);
    LD_UB8(input, in_pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);

    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p7, p6, p5, p4, p3, p2, p1, p0);

    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);

    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);

    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);

    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);

    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);

    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);

    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);

    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    output += (8 * out_pitch);
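
/* First filtering stage for a transposed vertical edge: build the filter
   mask, run the 4-tap filter everywhere, and where the "flat" test fires run
   the 8-tap filter as well.  __msa_fill_b broadcasts the scalar
   thresh/b_limit/limit values into every byte lane so whole rows can be
   compared at once. */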
 
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3;
 
    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
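    /* Only eight columns are in flight here, so the upper doubleword of the
       flat mask is cleared (__msa_ilvr_d with zero keeps the low 8 bytes);
       the all-zero test below then only sees valid lanes. */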
 
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
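        /* __msa_test_bz_v is true when every byte of flat is zero: no column
           needs the 8-tap filter, so the 4-tap results are interleaved back
           into pixel order, stored as 4-byte columns, and the caller is told
           to skip the 16-tap stage (see the early_exit checks in the
           wrappers further down). */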
 
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
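        /* Interleaving each byte vector with zero widens the samples to
           16 bits; the filter8 tap sums exceed the 8-bit range, so
           VP9_FILTER8 runs in u16 arithmetic on the low ("right") half of
           the rows. */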
 
        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
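        /* __msa_pckev_b keeps the even-indexed bytes, i.e. the low byte of
           every 16-bit result, narrowing the filtered samples back to 8 bits
           (both operands are the same vector, so the result is simply
           duplicated across the two halves). */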
 
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
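        /* The six blended rows are parked in the filter48 scratch area (six
           16-byte slots); the 16-tap stage reloads them from there
           (LD_UB4(filter48, ...) and LD_UB(filter48 + 16 * n)) instead of
           recomputing the filter8 results. */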
 
    v16u8 filter8, flat, flat2;
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 tmp0_r, tmp1_r;
    v8i16 r_out;
    v16i8 zero = { 0 };

    flat = LD_UB(filter48 + 96);

    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
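
    /* VP9_FLAT5 extends the flatness test to the full p7..q7 support and
       produces flat2.  If flat2 is zero everywhere, the 16-tap filter is
       skipped: the filter8 rows cached in filter48 are written out and the
       function exits early. */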
 
    if (__msa_test_bz_v(flat2)) {
        v8i16 vec0, vec1, vec2, vec3, vec4;

        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);

        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
        src_org += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
 
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
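
        /* Wide (flat2) filter: each output sample is a rounded average,
           (sum of 16 weighted taps + 8) >> 4, computed by
           __msa_srari_h(..., 4).  The first sum is built explicitly
           (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 for p6');
           every later output updates the running sum by adding the tap that
           enters the window and subtracting the one that leaves, e.g.
           tmp0_r = p5_r_in - p6_r_in. */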
 
        /* p6' */
        tmp0_r = p7_r_in << 3;

        tmp1_r = p6_r_in + p5_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);

        /* p5' */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);

        /* p4' */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);

        /* p3' */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);

        /* p2': fallback now comes from the cached filter8 rows */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* p1' */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* p0' */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q0' */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q1' */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q2' */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q3' */
        tmp0_r = q7_r_in - q2_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);

        /* q4' */
        tmp0_r = q7_r_in - q3_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);

        /* q5' */
        tmp0_r = q7_r_in - q4_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);

        /* q6' */
        tmp0_r = q7_r_in - q5_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
 
    uint8_t *filter48 = &transposed_input[16 * 16];
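
    /* Vertical-edge wrapper: the edge columns are first transposed into the
       transposed_input scratch buffer, the filter stages then run on that
       buffer (filter48 is the 6x16-byte handoff area between the 8-tap and
       16-tap passes), and each following pass runs only while the previous
       one returned early_exit == 0. */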
 
                                         &filter48[0], src, pitch,
                                         b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {

        if (0 == early_exit) {
 
                                        uint8_t *src_org, ptrdiff_t pitch,
 
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
 
    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
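
    /* 16-column variant: unlike the 8-wide path, the full flat mask is kept
       and the rows are later split into low ("_r", ilvr) and high ("_l",
       ilvl) halves so the 16-bit filter math can cover all 16 columns. */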
 
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src_org -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
 
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
 
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
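        /* VP9_FILTER8 is run twice, once per 8-column half; the halves are
           then re-packed into full 16-byte rows by the PCKEV_B* steps
           below. */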
 
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);
 
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
 
    v16u8 flat, flat2, filter8;
    v16i8 zero = { 0 };
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
    v8i16 l_out, r_out;

    flat = LD_UB(filter48 + 96);

    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
 
    if (__msa_test_bz_v(flat2)) {
        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
        src_org += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
        src_org += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
        src_org += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
        ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
 
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);

        tmp0_r = p7_r_in << 3;

        tmp1_r = p6_r_in + p5_r_in;
 
        /* p6' */
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
                   p5_l_in, p4_l_in);
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
                   p1_l_in, p0_l_in);
        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);

        tmp0_l = p7_l_in << 3;

        tmp1_l = p6_l_in + p5_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
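        /* The high half mirrors the low-half running sum in tmp0_l/tmp1_l;
           __msa_pckev_b then narrows and glues the two 8-sample halves back
           into one 16-byte row before the flat2 blend.  The same pattern
           repeats for every remaining output row. */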
 
        /* p5' */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
        tmp0_l = p5_l_in - p6_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);

        /* p4' */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
        tmp0_l = p4_l_in - p5_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);

        /* p3' */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
        tmp0_l = p3_l_in - p4_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);

        /* p2': fallback comes from the cached filter8 rows */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
        tmp0_l = p2_l_in - p3_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* p1' */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
        tmp0_l = p1_l_in - p2_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* p0' */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
        tmp0_l = p0_l_in - p1_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q0' */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
        tmp0_l = q7_l_in - p0_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q1' */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q0_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q2' */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q1_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);

        /* q3' */
        tmp0_r = q7_r_in - q2_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q2_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);

        /* q4' */
        tmp0_r = q7_r_in - q3_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q3_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);

        /* q5' */
        tmp0_r = q7_r_in - q4_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q4_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);

        /* q6' */
        tmp0_r = q7_r_in - q5_r_in;

        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
        tmp0_l = q7_l_in - q5_l_in;

        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
 
    uint8_t *filter48 = &transposed_input[16 * 16];

                                          &filter48[0], src, pitch,
                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {

        if (0 == early_exit) {