                                      const uint8_t *p_is_pcm, const uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src - stride_4x;
    uint8_t *p2 = src - stride_3x;
    uint8_t *p1 = src - stride_2x;
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + stride_2x;
    uint8_t *q3 = src + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1;
    __m128i temp2, tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
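    /* dpX/dqX below are the HEVC deblocking activity measures: second
     * differences |p2 - 2*p1 + p0| and |q2 - 2*q1 + q0|, taken on lines 0
     * and 3 of each 4-line half of the 8-line edge segment. */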
 
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
 
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
 
    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];
    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
    cmp3 = __lsx_vseqi_w(cmp3, 0);
 
    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
                  p3_src, p2_src, p1_src, p0_src);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
 
        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
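        /* tc250/tc254 = (5 * tc + 1) >> 1: the |p0 - q0| threshold used in
         * the strong-filter decision below. */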
 
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);
        DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
                  q0_src, q1_src, q2_src, q3_src);
 
        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
 
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
                  zero, q3_src, q0_src, q1_src, q2_src, q3_src);
 
        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
        cmp2 = __lsx_vpackev_w(cmp1, cmp0);
        cmp2 = __lsx_vseqi_w(cmp2, 0);
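        /* flag0/flag1 hold the scalar strong-filter decision for the first
         * and second 4-line halves; cmp2 is the same decision as a lane mask. */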
 
        if (flag0 && flag1) {
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
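            /* strong filter on both halves: p2' = (2*p3 + 3*p2 + p1 + p0 + q0
             * + 4) >> 3, p1' = (p2 + p1 + p0 + q0 + 2) >> 2, p0' = (p2 + 2*p1
             * + 2*p0 + 2*q0 + q1 + 4) >> 3, each clamped to +/-2*tc around the
             * original sample; the q side mirrors this. */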
 
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);
            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);
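            /* where the p block is PCM (or filtering is disabled), keep the
             * original samples */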
 
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
 
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);
            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);
            temp0 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
                      temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
 
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
 
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
 
        } else if (flag0 == flag1) {
            tc_neg = __lsx_vneg_h(tc_pos);
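            /* weak filter: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4 */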
 
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
 
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
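            /* abs_delta0 now masks the lanes with |delta0| < 10 * tc, the
             * only ones the weak filter may change */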
 
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src,
                                    __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
                                    q_is_pcm_vec));
            DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                      q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
 
            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
            DUP2_ARG1(__lsx_vreplgr2vr_d, !q_is_pcm0 && ((dq00 + dq30) < tmp),
                      !q_is_pcm4 && ((dq04 + dq34) < tmp), cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            cmp0 = __lsx_vseqi_d(cmp0, 0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
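            /* p1/q1 correction: p1' = p1 + clip((((p2 + p0 + 1) >> 1) - p1 +
             * delta0) >> 1, +/-(tc >> 1)), and symmetrically for q1 with
             * -delta0. */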
 
            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
                      tc_neg, tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
 
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst1, dst2, dst3, dst4);
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
            DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
                      dst2, dst3);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
                      dst0, dst1);
 
            p2 += stride;
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
 
        } else {
            tc_pos = __lsx_vslli_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
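            /* mixed case: one half is strong-filtered, the other
             * weak-filtered; both results are computed and blended per half
             * through cmp2. */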
 
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);
            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
                      p1_src, p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
 
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);
            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
                      q1_src, q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = __lsx_vpickev_b(dst5, dst4);
 
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
 
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
 
            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
            DUP2_ARG1(__lsx_vreplgr2vr_d, !q_is_pcm0 && ((dq00 + dq30) < tmp),
                      !q_is_pcm4 && ((dq04 + dq34) < tmp), cmp0, cmp1);
            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
 
            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
 
            DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
                      dst3, dst4);
            dst5 = __lsx_vpickev_b(q2_src, q1_src);
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
                      dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
 
            __lsx_vstelm_d(dst0, p2, 0, 0);
            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
        }
    }
}
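/* The vertical-edge variant gathers 8 pixels across the edge from each of 8
 * rows and transposes them, so the same filter math as the horizontal case
 * applies; the results are transposed back on store. */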
 
                                      const uint8_t *p_is_pcm, const uint8_t *q_is_pcm)
{
 
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src;
    uint8_t *p2 = src + stride_3x;
    uint8_t *p1 = src + stride_4x;
    uint8_t *p0 = src + stride_4x + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1, temp2;
    __m128i tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
 
    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];
    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];
 
    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_d(cmp1, cmp0);
    cmp3 = __lsx_vseqi_d(cmp3, 0);
 
    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        src -= 4;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
        src -= stride_4x;
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
 
        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);
 
        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                (d30 << 1) < beta20);
        cmp0 = __lsx_vreplgr2vr_d(flag0);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
                  p0_src, p3_src, p2_src, p1_src, p0_src);
 
        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                (d34 << 1) < beta20);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
                  q3_src, q0_src, q1_src, q2_src, q3_src);
        cmp1 = __lsx_vreplgr2vr_d(flag1);
        cmp2 = __lsx_vpackev_d(cmp1, cmp0);
        cmp2 = __lsx_vseqi_d(cmp2, 0);
 
        if (flag0 && flag1) {
            tc_neg = __lsx_vneg_h(tc_pos);
 
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);
            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
 
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);
            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
 
        } else if (flag0 == flag1) {
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
 
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
 
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
 
            tmp = ((beta + (beta >> 1)) >> 3);
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
 
            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
                      q1_src, abs_delta0, dst0, dst1, dst2, dst3);
 
            cmp3 = __lsx_vnor_v(cmp3, cmp3);
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
                      cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
            dst4 = __lsx_vilvl_b(dst1, dst0);
            dst5 = __lsx_vilvh_b(dst1, dst0);
            dst0 = __lsx_vilvl_h(dst5, dst4);
            dst1 = __lsx_vilvh_h(dst5, dst4);
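            /* dst0/dst1 now hold the transposed p1..q1 columns as rows; each
             * vstelm_w below writes one row's 4 filtered pixels */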
 
            src += 2;
            __lsx_vstelm_w(dst0, src, 0, 0);
            __lsx_vstelm_w(dst0, src + stride, 0, 1);
            __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
            src += stride_4x;
            __lsx_vstelm_w(dst1, src, 0, 0);
            __lsx_vstelm_w(dst1, src + stride, 0, 1);
            __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
            __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
            return;
        } else {
 
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
                      temp0, temp0);
            temp1 = __lsx_vadd_h(p3_src, p2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst0 = __lsx_vadd_h(temp2, p2_src);
            temp1 = __lsx_vadd_h(temp0, p2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, p1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst1 = __lsx_vadd_h(temp2, p1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, p0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst2 = __lsx_vadd_h(temp2, p0_src);
            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
                      p_is_pcm_vec, dst0, dst1);
            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
 
            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
            temp1 = __lsx_vadd_h(q3_src, q2_src);
            temp1 = __lsx_vslli_h(temp1, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q2_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst5 = __lsx_vadd_h(temp2, q2_src);
            temp1 = __lsx_vadd_h(temp0, q2_src);
            temp1 = __lsx_vsrari_h(temp1, 2);
            temp2 = __lsx_vsub_h(temp1, q1_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst4 = __lsx_vadd_h(temp2, q1_src);
            temp1 = __lsx_vslli_h(temp0, 1);
            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
            temp1 = __lsx_vsrari_h(temp1, 3);
            temp2 = __lsx_vsub_h(temp1, q0_src);
            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
            dst3 = __lsx_vadd_h(temp2, q0_src);
            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
                      q_is_pcm_vec, dst3, dst4);
            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
 
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
                      diff0, diff1);
            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
            delta0 = __lsx_vsub_h(diff0, diff1);
            delta0 = __lsx_vsrari_h(delta0, 4);
 
            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
                                 __lsx_vslli_h(tc_pos, 1));
            abs_delta0 = __lsx_vadda_h(delta0, zero);
            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
            temp2 = __lsx_vadd_h(delta0, p0_src);
            temp2 = __lsx_vclip255_h(temp2);
            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
            temp2 = __lsx_vsub_h(q0_src, delta0);
            temp2 = __lsx_vclip255_h(temp2);
            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
 
            tmp = (beta + (beta >> 1)) >> 3;
            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
            tc_pos = __lsx_vsrai_h(tc_pos, 1);
            tc_neg = __lsx_vneg_h(tc_pos);
 
            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
                      delta1, delta2);
            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
                      delta1, delta2);
            delta1 = __lsx_vadd_h(delta1, delta0);
            delta2 = __lsx_vsub_h(delta2, delta0);
            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
                      tc_pos, delta1, delta2);
            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
                      delta1, delta2);
            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
                      q1_src, q_is_pcm_vec, delta1, delta2);
            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
 
            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
                      cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
                      dst0, dst1, dst2, dst3);
            DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
                      dst4, dst5);
        }
 
        cmp3 = __lsx_vnor_v(cmp3, cmp3);
        DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
                  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
                  dst4, dst5);
        DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
                  dst5, dst0, dst1, dst2, dst3);
 
        DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
        DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
 
        src += 1;
        __lsx_vstelm_w(dst0, src, 0, 0);
        __lsx_vstelm_h(dst2, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 1);
        __lsx_vstelm_h(dst2, src, 4, 2);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 2);
        __lsx_vstelm_h(dst2, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 3);
        __lsx_vstelm_h(dst2, src, 4, 6);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 0);
        __lsx_vstelm_h(dst3, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 1);
        __lsx_vstelm_h(dst3, src, 4, 2);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 2);
        __lsx_vstelm_h(dst3, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 3);
        __lsx_vstelm_h(dst3, src, 4, 6);
    }
}
 
                                        const int32_t *tc, const uint8_t *p_is_pcm,
                                        const uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;
 
    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
                  p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);
        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
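        /* chroma weak filter: delta = clip((((q0 - p0) << 2) + p1 - q1 + 4)
         * >> 3, +/-tc), added to p0 and subtracted from q0 */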
 
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
        temp0 = __lsx_vadd_h(p0, delta);
        temp0 = __lsx_vclip255_h(temp0);
        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
        temp1 = __lsx_vsub_h(q0, delta);
        temp1 = __lsx_vclip255_h(temp1);
        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpickev_b(temp1, temp0);
        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
    }
}
 
                                        const int32_t *tc, const uint8_t *p_is_pcm,
                                        const uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;
 
    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
 
        src -= 2;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src0, src1, src2, src3);
        src += stride_4x;
        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
                  src + stride_3x, 0, src4, src5, src6, src7);
        src -= stride_4x;
        LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);
        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
 
        temp0 = __lsx_vadd_h(p0, delta);
        temp1 = __lsx_vsub_h(q0, delta);
        DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
        DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
                  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
                  q_is_pcm_vec, temp0, temp1);
        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpackev_b(temp1, temp0);
        src += 1;
        __lsx_vstelm_h(temp0, src, 0, 0);
        __lsx_vstelm_h(temp0, src + stride, 0, 1);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
        src += stride_4x;
        __lsx_vstelm_h(temp0, src, 0, 4);
        __lsx_vstelm_h(temp0, src + stride, 0, 5);
        __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
        __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
    }
}
 
                                                    const int16_t *sao_offset_val,
                                                    int32_t height)
{
    int32_t h_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
    __m128i const1 = __lsx_vldi(1);
    __m128i zero = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
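    /* SAO edge offset, 0-degree (horizontal) class: each pixel is compared
     * with its left and right neighbours, the two signs select an entry of
     * edge_idx, and that entry picks one of the sao_offset bytes. */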
 
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
    for (h_cnt = (height >> 1) - 1; h_cnt--;) {
        src += src_stride_2x;
        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }
 
    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);
    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
              offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
 
                                                    const int16_t *sao_offset_val,
                                                    int32_t height)
{
    int32_t h_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
 
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
    for (h_cnt = (height >> 1) - 1; h_cnt--;) {
        src += src_stride_2x;
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
                  src_minus11, shuf1, src0, src1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
                  src_minus11, shuf2, src_plus10, src_plus11);
        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
                  src_plus10, src_minus10, src_plus10);
        src0 = __lsx_vpickev_d(src1, src0);
        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
        offset = __lsx_vaddi_bu(offset, 2);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
                  src_minus10, src_minus11);
        src0 = __lsx_vxori_b(src0, 128);
        dst0 = __lsx_vsadd_b(src0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }
 
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
              shuf1, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
              src_plus10, src_minus10, src_plus10);
    src0 = __lsx_vpickev_d(src1, src0);
    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);
    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
    offset = __lsx_vaddi_bu(offset, 2);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
              offset, offset, offset);
    src0 = __lsx_vxori_b(src0, 128);
    dst0 = __lsx_vsadd_b(src0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
 
                                                        const int16_t *sao_offset_val,
                                                        int32_t width, int32_t height)
{
    uint8_t *dst_ptr;
    const uint8_t *src_minus1;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i sao_offset;
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
 
    for (; height; height -= 4) {
        src_minus1 = src - 1;
        src_minus10 = __lsx_vld(src_minus1, 0);
        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                  src_stride_2x, src_minus11, src_minus12);
        src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus1 += 16;
            dst_ptr = dst + v_cnt;
            src10 = __lsx_vld(src_minus1, 0);
            DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_minus1, src_stride_3x);
 
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
                      src_minus11, shuf1, src12, src_minus12, shuf1, src13,
                      src_minus13, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
                      src_minus11, shuf2, src12, src_minus12, shuf2, src13,
                      src_minus13, shuf2, src_plus10, src_plus11,
                      src_plus12, src_plus13);
            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
 
            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
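            /* offset_maskN = sign(c - left) + sign(c - right) + 2, i.e. the
             * edge-offset class in [0, 4] */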
 
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);
 
            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus10 = src10;
            src_minus11 = src11;
            src_minus12 = src12;
            src_minus13 = src13;
            __lsx_vst(dst0, dst_ptr, 0);
            __lsx_vst(dst1, dst_ptr + dst_stride, 0);
            __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
            __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
 
                                                     const int16_t *sao_offset_val,
                                                     int32_t height)
{
    int32_t h_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0;
    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i src_zero0, src_zero1;
    __m128i offset;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
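    /* 90-degree (vertical) class: same sign/lookup scheme as the 0-degree
     * case, but each pixel is compared with the rows above and below instead
     * of its left/right neighbours. */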
 
    DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
              src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
    for (h_cnt = (height >> 1) - 1; h_cnt--;) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;
        DUP2_ARG2(__lsx_vld, src + src_stride, 0, src + src_stride_2x, 0,
                  src10, src11);
        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }
 
 1370     DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
 
 1371               src11,  src_minus11, src10, src10, src_minus10, src_zero0,
 
 1372               src_minus11, src_zero1);
 
 1373     DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
 
 1374               cmp_minus10, cmp_minus11);
 
 1375     DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
 
 1376               diff_minus10, diff_minus11);
 
 1377     DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
 
 1378               cmp_minus10, cmp_minus11);
 
 1379     DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
 
 1380               cmp_minus10, cmp_minus11);
 
 1381     DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
 
 1382               const1, cmp_minus11, diff_minus10, diff_minus11);
 
 1384     DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
 
 1385               diff_minus11, offset_mask0, offset_mask1);
 
 1386     DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
 
 1387               offset_mask0, offset_mask1);
 
 1388     DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
 
 1389               src_zero0, 
offset, dst0);
 
 1392     dst0 = __lsx_vxori_b(dst0, 128);
 
 1393     dst0 = __lsx_vsadd_b(dst0, 
offset);
 
 1394     dst0 = __lsx_vxori_b(dst0, 128);
 
 1396     __lsx_vstelm_w(dst0, 
dst, 0, 0);
 
 1397     __lsx_vstelm_w(dst0, 
dst + dst_stride, 0, 2);
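
/*
 * Note on the __lsx_vxori_b(x, 128) / __lsx_vsadd_b / __lsx_vxori_b(x, 128)
 * triple used by every kernel here: LSX saturating byte adds are signed,
 * while the samples are unsigned. XORing with 128 re-biases u8 into s8,
 * the signed saturating add clamps, and the second XOR un-biases, which
 * together equals an unsigned add clipped to [0, 255]. A scalar sketch of
 * that identity (sao_add_clip_u8 is a hypothetical name, not FFmpeg API):
 */
static inline uint8_t sao_add_clip_u8(uint8_t cur, int8_t offset)
{
    int v = (int8_t)(cur ^ 0x80) + offset;       /* bias u8 -> s8, add */

    v = v < -128 ? -128 : (v > 127 ? 127 : v);   /* saturate like vsadd.b */
    return (uint8_t)(v + 128);                   /* un-bias back to 0..255 */
}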
 
static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *src,
                                                     int32_t src_stride,
                                                     const int16_t *sao_offset_val,
                                                     int32_t height)
{
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src_minus11, src10, src11;
    __m128i offset_mask0, offset_mask1;

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += src_stride_2x;
        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
                  src_minus11, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    /* last two rows */
    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
              src11, src_minus11, src10, src10, src_minus10, src_zero0,
              src_minus11, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
              offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
 
static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         const uint8_t *src,
                                                         int32_t src_stride,
                                                         const int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    const uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, dst2, src13, dst3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
                  src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                      src, src_stride_3x, src, src_stride_4x,
                      src10, src11, src12, src13);
            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            src_minus10 = src12;
            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
                      src12, 128, src_minus11, src10, src11, src12);
            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
                      offset_mask1, src11, offset_mask2, src12,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            __lsx_vst(dst0, dst, 0);
            __lsx_vstx(dst1, dst, dst_stride);
            __lsx_vstx(dst2, dst, dst_stride_2x);
            __lsx_vstx(dst3, dst, dst_stride_3x);
            src += src_stride_4x;
            dst += dst_stride_4x;
        }
    }
}
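
/*
 * A note on the DUPx_ARGy helpers used throughout this file: they come
 * from libavutil/loongarch/loongson_intrinsics.h and simply fan one
 * intrinsic out over several operand/result pairs, e.g.
 *
 *     DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
 *               src10, src11);
 *
 * expands to
 *
 *     src10 = __lsx_vldx(src, src_stride);
 *     src11 = __lsx_vldx(src, src_stride_2x);
 *
 * and DUP4_ARG2/DUP4_ARG3 do the same for four calls (ARGn giving the
 * number of arguments each call takes).
 */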
 
static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *src,
                                                     int32_t src_stride,
                                                     const int16_t *sao_offset_val,
                                                     int32_t height)
{
    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    /* last two rows */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}
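
/*
 * How the diagonal kernels build their neighbours: __lsx_vshuf_b(a, b, c)
 * selects bytes out of the 32-byte pair {a, b} (control bytes 0..15 pick
 * from b, 16..31 from a). shuf1 holds indices 1..16 and shuf2 indices
 * 2..17, i.e. one- and two-byte shifts across that pair. Since src_orig
 * points one byte left of the row (src - 1), shuf1 re-centres a loaded
 * row on column x while shuf2 yields column x + 1, so the (x-1)/(x+1)
 * diagonal taps are formed from the same aligned loads instead of extra
 * unaligned ones.
 */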
 
static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *src,
                                                     int32_t src_stride,
                                                     const int16_t *sao_offset_val,
                                                     int32_t height)
{
    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
              src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus10, src_plus11);

        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    /* last two rows */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);

    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    /* load in advance */
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
 
static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         const uint8_t *src,
                                                         int32_t src_stride,
                                                         const int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    const uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13, src_minus14, src_plus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
    __m128i src_zero3, sao_offset, src_plus12;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_minus12, src_minus13);
        src_minus14 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 0);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);
            src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_minus12, shuf1, src12, src_minus13, shuf1,
                      src13, src_minus14, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
                      src_minus13, shuf2, src_plus10, src_plus11);
            src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10,
                      cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
                      128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
 
static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      const uint8_t *src,
                                                      int32_t src_stride,
                                                      const int16_t *sao_offset_val,
                                                      int32_t height)
{
    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    /* last two rows */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
    dst += dst_stride_2x;
}
 
static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      const uint8_t *src,
                                                      int32_t src_stride,
                                                      const int16_t *sao_offset_val,
                                                      int32_t height)
{
    const uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10,
                  diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);

        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    /* last two rows */
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);

    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}
 
static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          const uint8_t *src,
                                                          int32_t src_stride,
                                                          const int16_t *sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    const uint8_t *src_orig;
    uint8_t *dst_orig;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0, dst1, dst2, dst3;
    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_plus10, src_plus11);
        src_plus12 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 2);
            src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                      src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_plus10, shuf1, src12, src_plus11, shuf1, src13,
                      src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
                      src_plus11, shuf2, src_minus12, src_minus13);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}
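
/*
 * The static kernels above are the per-direction workers (0, 90, 45 and
 * 135 degrees, each in 4-wide, 8-wide and 16-multiple variants). They
 * are presumably dispatched on the SAO edge-offset class and the block
 * width by the public entry point whose signature follows.
 */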
 
void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   const int16_t *sao_offset_val,