26                           const int16_t **
src, uint8_t *dest, 
int dstW,
 
   31     __m256i 
mask = {0x1C0C180814041000, 0x1C1814100C080400,
 
   32                     0x1C0C180814041000, 0x1C1814100C080400};
 
   33     __m256i val1, val2, val3;
 
   42     int val_1[8] = {dither0, dither2, dither4, dither6,
 
   43                     dither0, dither2, dither4, dither6};
 
   44     int val_2[8] = {dither1, dither3, dither5, dither7,
 
   45                     dither1, dither3, dither5, dither7};
 
   46     int val_3[8] = {dither0, dither1, dither2, dither3,
 
   47                     dither4, dither5, dither6, dither7};
 
   49     DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
 
   50     val3 = __lasx_xvld(val_3, 0);
 
   52     for (
i = 0; 
i < 
len; 
i += 16) {
 
   55         __m256i val_ev, val_od;
 
   57         val_ev = __lasx_xvslli_w(val1, 12);
 
   58         val_od = __lasx_xvslli_w(val2, 12);
 
   60         for (j = 0; j < filterSize; j++) {
 
   63             val_ev = __lasx_xvmaddwev_w_h(val_ev, 
src0, 
filter0);
 
   64             val_od = __lasx_xvmaddwod_w_h(val_od, 
src0, 
filter0);
 
   66         val_ev = __lasx_xvsrai_w(val_ev, 19);
 
   67         val_od = __lasx_xvsrai_w(val_od, 19);
 
   68         val_ev = __lasx_xvclip255_w(val_ev);
 
   69         val_od = __lasx_xvclip255_w(val_od);
 
   70         val    = __lasx_xvshuf_b(val_od, val_ev, 
mask);
 
   71         __lasx_xvstelm_d(
val, (dest + 
i), 0, 0);
 
   72         __lasx_xvstelm_d(
val, (dest + 
i), 8, 2);
 
   79         val_l = __lasx_xvslli_w(val3, 12);
 
   81         for (j = 0; j < filterSize; j++) {
 
   88         val_l = __lasx_xvsrai_w(val_l, 19);
 
   89         val_l = __lasx_xvclip255_w(val_l);
 
   90         val_h = __lasx_xvpermi_d(val_l, 0x4E);
 
   91         val_l = __lasx_xvshuf_b(val_h, val_l, 
mask);
 
   92         __lasx_xvstelm_d(val_l, (dest + 
i), 0, 1);
 
   95     for (; 
i < dstW; 
i++) {
 
   98         for (j = 0; j< filterSize; j++)
 
  108               unsigned A1, 
unsigned A2,
 
  109               const void *_r, 
const void *_g, 
const void *_b, 
int y,
 
  114         uint32_t *dest = (uint32_t *) _dest;
 
  115         const uint32_t *
r = (
const uint32_t *) _r;
 
  116         const uint32_t *
g = (
const uint32_t *) _g;
 
  117         const uint32_t *
b = (
const uint32_t *) _b;
 
  120         dest[
i * 2 + 0] = 
r[Y1] + 
g[Y1] + 
b[Y1];
 
  121         dest[
i * 2 + 1] = 
r[Y2] + 
g[Y2] + 
b[Y2];
 
  123 #if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1 
  126         av_assert2((((
r[Y1] + 
g[Y1] + 
b[Y1]) >> sh) & 0xFF) == 0xFF);
 
  128         dest[
i * 2 + 0] = 
r[Y1] + 
g[Y1] + 
b[Y1];
 
  129         dest[
i * 2 + 1] = 
r[Y2] + 
g[Y2] + 
b[Y2];
 
  132         uint8_t *dest = (uint8_t *) _dest;
 
  133         const uint8_t *
r = (
const uint8_t *) _r;
 
  134         const uint8_t *
g = (
const uint8_t *) _g;
 
  135         const uint8_t *
b = (
const uint8_t *) _b;
 
   /* Channel-order selectors for the 24-bit stores that follow.
    * r_b evaluates to the r table when target is AV_PIX_FMT_RGB24 and to the
    * b table otherwise; b_r is the mirror image.  Both expand in place and
    * rely on the locals target, r and b being in scope at the use site. */
   137 #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b) 
   138 #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r) 
  140         dest[
i * 6 + 0] = 
r_b[Y1];
 
  141         dest[
i * 6 + 1] =   
g[Y1];
 
  142         dest[
i * 6 + 2] = 
b_r[Y1];
 
  143         dest[
i * 6 + 3] = 
r_b[Y2];
 
  144         dest[
i * 6 + 4] =   
g[Y2];
 
  145         dest[
i * 6 + 5] = 
b_r[Y2];
 
  151         uint16_t *dest = (uint16_t *) _dest;
 
  152         const uint16_t *
r = (
const uint16_t *) _r;
 
  153         const uint16_t *
g = (
const uint16_t *) _g;
 
  154         const uint16_t *
b = (
const uint16_t *) _b;
 
  155         int dr1, dg1, db1, dr2, dg2, db2;
 
  180         dest[
i * 2 + 0] = 
r[Y1 + dr1] + 
g[Y1 + dg1] + 
b[Y1 + db1];
 
  181         dest[
i * 2 + 1] = 
r[Y2 + dr2] + 
g[Y2 + dg2] + 
b[Y2 + db2];
 
  183         uint8_t *dest = (uint8_t *) _dest;
 
  184         const uint8_t *
r = (
const uint8_t *) _r;
 
  185         const uint8_t *
g = (
const uint8_t *) _g;
 
  186         const uint8_t *
b = (
const uint8_t *) _b;
 
  187         int dr1, dg1, db1, dr2, dg2, db2;
 
  192             dr1 = dg1 = 
d32[(
i * 2 + 0) & 7];
 
  193             db1 =       
d64[(
i * 2 + 0) & 7];
 
  194             dr2 = dg2 = 
d32[(
i * 2 + 1) & 7];
 
  195             db2 =       
d64[(
i * 2 + 1) & 7];
 
  199             dr1 = db1 = 
d128[(
i * 2 + 0) & 7];
 
  200             dg1 =        
d64[(
i * 2 + 0) & 7];
 
  201             dr2 = db2 = 
d128[(
i * 2 + 1) & 7];
 
  202             dg2 =        
d64[(
i * 2 + 1) & 7];
 
  206             dest[
i] = 
r[Y1 + dr1] + 
g[Y1 + dg1] + 
b[Y1 + db1] +
 
  207                     ((
r[Y2 + dr2] + 
g[Y2 + dg2] + 
b[Y2 + db2]) << 4);
 
  209             dest[
i * 2 + 0] = 
r[Y1 + dr1] + 
g[Y1 + dg1] + 
b[Y1 + db1];
 
  210             dest[
i * 2 + 1] = 
r[Y2 + dr2] + 
g[Y2 + dg2] + 
b[Y2 + db2];
 
  215 #define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)    \ 
  217     Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                             \ 
  218     Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                             \ 
  219     U  = __lasx_xvpickve2gr_w(vec_u, t3);                              \ 
  220     V  = __lasx_xvpickve2gr_w(vec_v, t4);                              \ 
  221     r  =  c->table_rV[V];                                              \ 
  222     g  = (c->table_gU[U] + c->table_gV[V]);                            \ 
  223     b  =  c->table_bU[U];                                              \ 
  224     yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                           \ 
  225                   r, g, b, y, target, 0);                              \ 
  231                         const int16_t **lumSrc, 
int lumFilterSize,
 
  232                         const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  233                         const int16_t **chrVSrc, 
int chrFilterSize,
 
  234                         const int16_t **alpSrc, uint8_t *dest, 
int dstW,
 
  242     int len_count = (dstW + 1) >> 1;
 
  243     const void *
r, *
g, *
b;
 
  245     __m256i 
headroom  = __lasx_xvreplgr2vr_w(head);
 
  247     for (
i = 0; 
i < 
len; 
i++) {
 
  248         int Y1, Y2, 
U, 
V, count_lum = count << 1;
 
  249         __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
 
  250         __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
 
  251         __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, 
temp;
 
  253         yl1_ev = __lasx_xvldrepl_w(&t, 0);
 
  269         for (j = 0; j < lumFilterSize; j++) {
 
  270             const int16_t *src_lum = lumSrc[j] + count_lum;
 
  271             temp    = __lasx_xvldrepl_h((lumFilter + j), 0);
 
  272             DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
 
  273                       src_lum, 96, l_src1, l_src2, l_src3, l_src4);
 
  275             yl1_ev  = __lasx_xvmaddwev_w_h(yl1_ev, 
temp, l_src1);
 
  276             yl1_od  = __lasx_xvmaddwod_w_h(yl1_od, 
temp, l_src1);
 
  277             yh1_ev  = __lasx_xvmaddwev_w_h(yh1_ev, 
temp, l_src2);
 
  278             yh1_od  = __lasx_xvmaddwod_w_h(yh1_od, 
temp, l_src2);
 
  279             yl2_ev  = __lasx_xvmaddwev_w_h(yl2_ev, 
temp, l_src3);
 
  280             yl2_od  = __lasx_xvmaddwod_w_h(yl2_od, 
temp, l_src3);
 
  281             yh2_ev  = __lasx_xvmaddwev_w_h(yh2_ev, 
temp, l_src4);
 
  282             yh2_od  = __lasx_xvmaddwod_w_h(yh2_od, 
temp, l_src4);
 
  284         for (j = 0; j < chrFilterSize; j++) {
 
  285             DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
 
  287             DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
 
  289             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
  290             u1_ev  = __lasx_xvmaddwev_w_h(u1_ev, 
temp, u_src1);
 
  291             u1_od  = __lasx_xvmaddwod_w_h(u1_od, 
temp, u_src1);
 
  292             v1_ev  = __lasx_xvmaddwev_w_h(v1_ev, 
temp, v_src1);
 
  293             v1_od  = __lasx_xvmaddwod_w_h(v1_od, 
temp, v_src1);
 
  294             u2_ev  = __lasx_xvmaddwev_w_h(u2_ev, 
temp, u_src2);
 
  295             u2_od  = __lasx_xvmaddwod_w_h(u2_od, 
temp, u_src2);
 
  296             v2_ev  = __lasx_xvmaddwev_w_h(v2_ev, 
temp, v_src2);
 
  297             v2_od  = __lasx_xvmaddwod_w_h(v2_od, 
temp, v_src2);
 
  299         yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
 
  300         yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
 
  301         yl1_od = __lasx_xvsrai_w(yl1_od, 19);
 
  302         yh1_od = __lasx_xvsrai_w(yh1_od, 19);
 
  303         u1_ev  = __lasx_xvsrai_w(u1_ev, 19);
 
  304         v1_ev  = __lasx_xvsrai_w(v1_ev, 19);
 
  305         u1_od  = __lasx_xvsrai_w(u1_od, 19);
 
  306         v1_od  = __lasx_xvsrai_w(v1_od, 19);
 
  307         yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
 
  308         yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
 
  309         yl2_od = __lasx_xvsrai_w(yl2_od, 19);
 
  310         yh2_od = __lasx_xvsrai_w(yh2_od, 19);
 
  311         u2_ev  = __lasx_xvsrai_w(u2_ev, 19);
 
  312         v2_ev  = __lasx_xvsrai_w(v2_ev, 19);
 
  313         u2_od  = __lasx_xvsrai_w(u2_od, 19);
 
  314         v2_od  = __lasx_xvsrai_w(v2_od, 19);
 
  315         u1_ev  = __lasx_xvadd_w(u1_ev, 
headroom);
 
  316         v1_ev  = __lasx_xvadd_w(v1_ev, 
headroom);
 
  317         u1_od  = __lasx_xvadd_w(u1_od, 
headroom);
 
  318         v1_od  = __lasx_xvadd_w(v1_od, 
headroom);
 
  319         u2_ev  = __lasx_xvadd_w(u2_ev, 
headroom);
 
  320         v2_ev  = __lasx_xvadd_w(v2_ev, 
headroom);
 
  321         u2_od  = __lasx_xvadd_w(u2_od, 
headroom);
 
  322         v2_od  = __lasx_xvadd_w(v2_od, 
headroom);
 
  357         int Y1, Y2, 
U, 
V, count_lum = count << 1;
 
  358         __m256i l_src1, l_src2, u_src, v_src;
 
  359         __m256i yl_ev, yl_od, yh_ev, yh_od;
 
  360         __m256i u_ev, u_od, v_ev, v_od, 
temp;
 
  362         yl_ev = __lasx_xvldrepl_w(&t, 0);
 
  370         for (j = 0; j < lumFilterSize; j++) {
 
  371             temp   = __lasx_xvldrepl_h((lumFilter + j), 0);
 
  372             DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
 
  374             yl_ev  = __lasx_xvmaddwev_w_h(yl_ev, 
temp, l_src1);
 
  375             yl_od  = __lasx_xvmaddwod_w_h(yl_od, 
temp, l_src1);
 
  376             yh_ev  = __lasx_xvmaddwev_w_h(yh_ev, 
temp, l_src2);
 
  377             yh_od  = __lasx_xvmaddwod_w_h(yh_od, 
temp, l_src2);
 
  379         for (j = 0; j < chrFilterSize; j++) {
 
  380             DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
 
  382             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
  383             u_ev  = __lasx_xvmaddwev_w_h(u_ev, 
temp, u_src);
 
  384             u_od  = __lasx_xvmaddwod_w_h(u_od, 
temp, u_src);
 
  385             v_ev  = __lasx_xvmaddwev_w_h(v_ev, 
temp, v_src);
 
  386             v_od  = __lasx_xvmaddwod_w_h(v_od, 
temp, v_src);
 
  388         yl_ev = __lasx_xvsrai_w(yl_ev, 19);
 
  389         yh_ev = __lasx_xvsrai_w(yh_ev, 19);
 
  390         yl_od = __lasx_xvsrai_w(yl_od, 19);
 
  391         yh_od = __lasx_xvsrai_w(yh_od, 19);
 
  392         u_ev  = __lasx_xvsrai_w(u_ev, 19);
 
  393         v_ev  = __lasx_xvsrai_w(v_ev, 19);
 
  394         u_od  = __lasx_xvsrai_w(u_od, 19);
 
  395         v_od  = __lasx_xvsrai_w(v_od, 19);
 
  396         u_ev  = __lasx_xvadd_w(u_ev, 
headroom);
 
  397         v_ev  = __lasx_xvadd_w(v_ev, 
headroom);
 
  398         u_od  = __lasx_xvadd_w(u_od, 
headroom);
 
  399         v_od  = __lasx_xvadd_w(v_od, 
headroom);
 
  420         int count_lum = count << 1;
 
  421         __m256i l_src, u_src, v_src;
 
  422         __m256i y_ev, y_od, 
u, v, 
temp;
 
  424         y_ev = __lasx_xvldrepl_w(&t, 0);
 
  428         for (j = 0; j < lumFilterSize; j++) {
 
  429             temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
 
  430             l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
 
  431             y_ev  = __lasx_xvmaddwev_w_h(y_ev, 
temp, l_src);
 
  432             y_od  = __lasx_xvmaddwod_w_h(y_od, 
temp, l_src);
 
  434         for (j = 0; j < chrFilterSize; j++) {
 
  435             DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
 
  437             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
  438             u_src = __lasx_vext2xv_w_h(u_src);
 
  439             v_src = __lasx_vext2xv_w_h(v_src);
 
  440             u     = __lasx_xvmaddwev_w_h(
u, 
temp, u_src);
 
  441             v     = __lasx_xvmaddwev_w_h(v, 
temp, v_src);
 
  443         y_ev = __lasx_xvsrai_w(y_ev, 19);
 
  444         y_od = __lasx_xvsrai_w(y_od, 19);
 
  445         u    = __lasx_xvsrai_w(
u, 19);
 
  446         v    = __lasx_xvsrai_w(v, 19);
 
  461         int count_lum = count << 1;
 
  462         __m256i l_src, u_src, v_src;
 
  463         __m256i y_ev, uv, 
temp;
 
  465         y_ev = __lasx_xvldrepl_w(&t, 0);
 
  467         for (j = 0; j < lumFilterSize; j++) {
 
  468             temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
 
  469             l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
 
  470             l_src = __lasx_vext2xv_w_h(l_src);
 
  471             y_ev  = __lasx_xvmaddwev_w_h(y_ev, 
temp, l_src);
 
  473         for (j = 0; j < chrFilterSize; j++) {
 
  474             u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
 
  475             v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
 
  476             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
  477             u_src = __lasx_xvilvl_d(v_src, u_src);
 
  478             u_src = __lasx_vext2xv_w_h(u_src);
 
  479             uv    = __lasx_xvmaddwev_w_h(uv, 
temp, u_src);
 
  481         y_ev = __lasx_xvsrai_w(y_ev, 19);
 
  482         uv   = __lasx_xvsrai_w(uv, 19);
 
  489     for (; count < len_count; count++) {
 
  495         for (j = 0; j < lumFilterSize; j++) {
 
  496             Y1 += lumSrc[j][count * 2]     * lumFilter[j];
 
  497             Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
 
  499         for (j = 0; j < chrFilterSize; j++) {
 
  500             U += chrUSrc[j][count] * chrFilter[j];
 
  501             V += chrVSrc[j][count] * chrFilter[j];
 
  513                       r, 
g, 
b, y, target, 0);
 
  519                         const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  520                         const int16_t *abuf[2], uint8_t *dest, 
int dstW,
 
  521                         int yalpha, 
int uvalpha, 
int y,
 
  524     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  525                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 
  526                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 
  527     int yalpha1   = 4096 - yalpha;
 
  528     int uvalpha1  = 4096 - uvalpha;
 
  531     int len_count = (dstW + 1) >> 1;
 
  532     const void *
r, *
g, *
b;
 
  534     __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
 
  535     __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
 
  536     __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
 
  537     __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
 
  538     __m256i 
headroom   = __lasx_xvreplgr2vr_w(head);
 
  540     for (
i = 0; 
i < 
len; 
i += 16) {
 
  543         int c_dex = count << 1;
 
  544         __m256i y0_h, y0_l, y0, u0, 
v0;
 
  545         __m256i y1_h, y1_l, y1, u1, v1;
 
  546         __m256i y_l, y_h, 
u, v;
 
  548         DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
 
  549                   buf1, i_dex, y0, u0, 
v0, y1);
 
  550         DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
 
  551         DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
 
  552         DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
 
  553         DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, 
v0, v1, u0, u1, 
v0, v1);
 
  554         y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
 
  555         y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
 
  556         u0   = __lasx_xvmul_w(u0, v_uvalpha1);
 
  557         v0   = __lasx_xvmul_w(
v0, v_uvalpha1);
 
  558         y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
 
  559         y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
 
  560         u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
 
  561         v    = __lasx_xvmadd_w(
v0, v_uvalpha, v1);
 
  562         y_l  = __lasx_xvsrai_w(y_l, 19);
 
  563         y_h  = __lasx_xvsrai_w(y_h, 19);
 
  564         u    = __lasx_xvsrai_w(
u, 19);
 
  565         v    = __lasx_xvsrai_w(v, 19);
 
  580         __m256i y0_l, y0, u0, 
v0;
 
  581         __m256i y1_l, y1, u1, v1;
 
  584         y0   = __lasx_xvldx(buf0, i_dex);
 
  585         u0   = __lasx_xvldrepl_d((ubuf0 + count), 0);
 
  586         v0   = __lasx_xvldrepl_d((vbuf0 + count), 0);
 
  587         y1   = __lasx_xvldx(buf1, i_dex);
 
  588         u1   = __lasx_xvldrepl_d((ubuf1 + count), 0);
 
  589         v1   = __lasx_xvldrepl_d((vbuf1 + count), 0);
 
  590         DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
 
  591         DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, 
v0, v1, u0, u1, 
v0, v1);
 
  592         y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
 
  593         u0   = __lasx_xvmul_w(u0, v_uvalpha1);
 
  594         v0   = __lasx_xvmul_w(
v0, v_uvalpha1);
 
  595         y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
 
  596         u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
 
  597         v    = __lasx_xvmadd_w(
v0, v_uvalpha, v1);
 
  598         y_l  = __lasx_xvsrai_w(y_l, 19);
 
  599         u    = __lasx_xvsrai_w(
u, 19);
 
  600         v    = __lasx_xvsrai_w(v, 19);
 
  609     for (; count < len_count; count++) {
 
  610         int Y1 = (buf0[count * 2]     * yalpha1  +
 
  611                   buf1[count * 2]     * yalpha)  >> 19;
 
  612         int Y2 = (buf0[count * 2 + 1] * yalpha1  +
 
  613                   buf1[count * 2 + 1] * yalpha) >> 19;
 
  614         int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
 
  615         int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
 
  623                       r, 
g, 
b, y, target, 0);
 
  629                         const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  630                         const int16_t *abuf0, uint8_t *dest, 
int dstW,
 
  634     const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
 
  636     int len       = (dstW - 15);
 
  637     int len_count = (dstW + 1) >> 1;
 
  638     const void *
r, *
g, *
b;
 
  640     if (uvalpha < 2048) {
 
  643         __m256i 
headroom  = __lasx_xvreplgr2vr_h(head);
 
  645         for (
i = 0; 
i < 
len; 
i += 16) {
 
  648             int c_dex = count << 1;
 
  649             __m256i src_y, src_u, src_v;
 
  650             __m256i 
u, v, y_l, y_h;
 
  652             DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
 
  653             src_v = __lasx_xvldx(vbuf0, c_dex);
 
  654             src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
 
  655             src_y = __lasx_xvsrari_h(src_y, 7);
 
  656             src_u = __lasx_xvsrari_h(src_u, 7);
 
  657             y_l   = __lasx_xvsllwil_w_h(src_y, 0);
 
  658             y_h   = __lasx_xvexth_w_h(src_y);
 
  659             u     = __lasx_xvaddwev_w_h(src_u, 
headroom);
 
  660             v     = __lasx_xvaddwod_w_h(src_u, 
headroom);
 
  673             __m256i src_y, src_u, src_v;
 
  676             src_y  = __lasx_xvldx(buf0, i_dex);
 
  677             src_u  = __lasx_xvldrepl_d((ubuf0 + count), 0);
 
  678             src_v  = __lasx_xvldrepl_d((vbuf0 + count), 0);
 
  679             src_u  = __lasx_xvilvl_d(src_v, src_u);
 
  680             y_l    = __lasx_xvsrari_h(src_y, 7);
 
  681             uv     = __lasx_xvsrari_h(src_u, 7);
 
  682             y_l    = __lasx_vext2xv_w_h(y_l);
 
  683             uv     = __lasx_vext2xv_w_h(uv);
 
  684             uv     = __lasx_xvaddwev_w_h(uv, 
headroom);
 
  691         for (; count < len_count; count++) {
 
  692             int Y1 = (buf0[count * 2    ] + 64) >> 7;
 
  693             int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
 
  694             int U  = (ubuf0[count]        + 64) >> 7;
 
  695             int V  = (vbuf0[count]        + 64) >> 7;
 
  703                           r, 
g, 
b, y, target, 0);
 
  706         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
 
  709         __m256i 
headroom    = __lasx_xvreplgr2vr_w(HEADROOM);
 
  711         for (
i = 0; 
i < 
len; 
i += 16) {
 
  714             int c_dex = count << 1;
 
  715             __m256i src_y, src_u0, src_v0, src_u1, src_v1;
 
  716             __m256i y_l, y_h, 
u, v;
 
  718             DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
 
  719                       ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
 
  720             src_v1 = __lasx_xvldx(vbuf1, c_dex);
 
  721             src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
 
  722             src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
 
  723             src_y  = __lasx_xvsrari_h(src_y, 7);
 
  724             u      = __lasx_xvaddwev_w_h(src_u0, src_u1);
 
  725             v      = __lasx_xvaddwod_w_h(src_u0, src_u1);
 
  726             y_l    = __lasx_xvsllwil_w_h(src_y, 0);
 
  727             y_h    = __lasx_xvexth_w_h(src_y);
 
  728             u      = __lasx_xvsrari_w(
u, 8);
 
  729             v      = __lasx_xvsrari_w(v, 8);
 
  744             __m256i src_y, src_u0, src_v0, src_u1, src_v1;
 
  747             src_y  = __lasx_xvldx(buf0, i_dex);
 
  748             src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
 
  749             src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
 
  750             src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
 
  751             src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
 
  753             src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
 
  754             src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
 
  755             src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
 
  756             src_y  = __lasx_xvsrari_h(src_y, 7);
 
  757             uv     = __lasx_xvhaddw_w_h(src_u0, src_u0);
 
  758             src_y  = __lasx_vext2xv_w_h(src_y);
 
  759             uv     = __lasx_xvsrari_w(uv, 8);
 
  767         for (; count < len_count; count++) {
 
  768             int Y1 = (buf0[count * 2    ]         +  64) >> 7;
 
  769             int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
 
  770             int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
 
  771             int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
 
  779                           r, 
g, 
b, y, target, 0);
 
  784 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                \ 
  785 static void name ## ext ## _X_lasx(SwsContext *c, const int16_t *lumFilter,            \ 
  786                                    const int16_t **lumSrc, int lumFilterSize,          \ 
  787                                    const int16_t *chrFilter, const int16_t **chrUSrc,  \ 
  788                                    const int16_t **chrVSrc, int chrFilterSize,         \ 
  789                                    const int16_t **alpSrc, uint8_t *dest, int dstW,    \ 
  792     name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize,              \ 
  793                                      chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \ 
  794                                      alpSrc, dest, dstW, y, fmt, hasAlpha);            \ 
  797 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                               \ 
  798 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                        \ 
  799 static void name ## ext ## _2_lasx(SwsContext *c, const int16_t *buf[2],               \ 
  800                                    const int16_t *ubuf[2], const int16_t *vbuf[2],     \ 
  801                                    const int16_t *abuf[2], uint8_t *dest, int dstW,    \ 
  802                                    int yalpha, int uvalpha, int y)                     \ 
  804     name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest,                   \ 
  805                                      dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \ 
  808 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                 \ 
  809 YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                       \ 
  810 static void name ## ext ## _1_lasx(SwsContext *c, const int16_t *buf0,                 \ 
  811                                    const int16_t *ubuf[2], const int16_t *vbuf[2],     \ 
  812                                    const int16_t *abuf0, uint8_t *dest, int dstW,      \ 
  813                                    int uvalpha, int y)                                 \ 
  815     name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest,                 \ 
  816                                      dstW, uvalpha, y, fmt, hasAlpha);                 \ 
  822 #if CONFIG_SWSCALE_ALPHA 
  838     uint8_t *dest, 
int i, 
int R, 
int A, 
int G, 
int B,
 
  843     if ((
R | 
G | 
B) & 0xC0000000) {
 
  851         dest[0] = hasAlpha ? 
A : 255;
 
  865         dest[3] = hasAlpha ? 
A : 255;
 
  868         dest[0] = hasAlpha ? 
A : 255;
 
  882         dest[3] = hasAlpha ? 
A : 255;
 
  898             R += (7*err[0] + 1*
c->dither_error[0][
i] + 5*
c->dither_error[0][
i+1] + 3*
c->dither_error[0][
i+2])>>4;
 
  899             G += (7*err[1] + 1*
c->dither_error[1][
i] + 5*
c->dither_error[1][
i+1] + 3*
c->dither_error[1][
i+2])>>4;
 
  900             B += (7*err[2] + 1*
c->dither_error[2][
i] + 5*
c->dither_error[2][
i+1] + 3*
c->dither_error[2][
i+2])>>4;
 
  901             c->dither_error[0][
i] = err[0];
 
  902             c->dither_error[1][
i] = err[1];
 
  903             c->dither_error[2][
i] = err[2];
 
  904             r = 
R >> (isrgb8 ? 5 : 7);
 
  905             g = 
G >> (isrgb8 ? 5 : 6);
 
  906             b = 
B >> (isrgb8 ? 6 : 7);
 
  910             err[0] = 
R - 
r*(isrgb8 ? 36 : 255);
 
  911             err[1] = 
G - 
g*(isrgb8 ? 36 : 85);
 
  912             err[2] = 
B - 
b*(isrgb8 ? 85 : 255);
 
   /* A_DITHER(u, v): computes ((u + v * 236) * 119) and keeps only the low
    * 8 bits (& 0xff).  Both arguments are parenthesised and evaluated once.
    * NOTE(review): the use sites are not visible in this excerpt; presumably
    * u and v are pixel coordinates (possibly with a per-channel offset) -
    * confirm against the caller. */
   917 #define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff)) 
   /* X_DITHER(u, v): computes ((u XOR (v * 237)) * 181), keeps the low 9 bits
    * (& 0x1ff) and halves the result, so the masked value 0..511 becomes
    * 0..255.  Both arguments are parenthesised and evaluated once.
    * NOTE(review): the use sites are not visible in this excerpt; presumably
    * u and v are pixel coordinates - confirm against the caller. */
   936 #define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2) 
  956             dest[0] = 
r + 2*
g + 8*
b;
 
  958             dest[0] = 
b + 2*
g + 8*
r;
 
  960             dest[0] = 
r + 8*
g + 64*
b;
 
  962             dest[0] = 
b + 4*
g + 32*
r;
 
   /* YUV2RGB_SETUP: declaration macro, expanded inside a function that has a
    * context pointer named c in scope.
    * It declares six int locals (y_offset, y_coeff, v2r_coe, v2g_coe, u2g_coe,
    * u2b_coe) copied from the c->yuv2rgb_* fields, then six __m256i locals
    * (offset, coeff, v2r, v2g, u2g, u2b), each built by passing the matching
    * int to __lasx_xvreplgr2vr_w.
    * Because it introduces these names into the enclosing scope, it can be
    * expanded at most once per block and the names must not already be
    * declared there. */
   969 #define YUV2RGB_SETUP                                            \ 
   970     int y_offset   = c->yuv2rgb_y_offset;                        \ 
   971     int y_coeff    = c->yuv2rgb_y_coeff;                         \ 
   972     int v2r_coe    = c->yuv2rgb_v2r_coeff;                       \ 
   973     int v2g_coe    = c->yuv2rgb_v2g_coeff;                       \ 
   974     int u2g_coe    = c->yuv2rgb_u2g_coeff;                       \ 
   975     int u2b_coe    = c->yuv2rgb_u2b_coeff;                       \ 
   976     __m256i offset = __lasx_xvreplgr2vr_w(y_offset);             \ 
   977     __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff);              \ 
   978     __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe);              \ 
   979     __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe);              \ 
   980     __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe);              \ 
   981     __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);              \ 
  984 #define YUV2RGB(y, u, v, R, G, B, offset, coeff,               \ 
  985                  y_temp, v2r, v2g, u2g, u2b)                   \ 
  987      y = __lasx_xvsub_w(y, offset);                            \ 
  988      y = __lasx_xvmul_w(y, coeff);                             \ 
  989      y = __lasx_xvadd_w(y, y_temp);                            \ 
  990      R = __lasx_xvmadd_w(y, v, v2r);                           \ 
  991      v = __lasx_xvmadd_w(y, v, v2g);                           \ 
  992      G = __lasx_xvmadd_w(v, u, u2g);                           \ 
  993      B = __lasx_xvmadd_w(y, u, u2b);                           \ 
  996 #define WRITE_FULL_A(r, g, b, a, t1, s)                                      \ 
  998     R = __lasx_xvpickve2gr_w(r, t1);                                         \ 
  999     G = __lasx_xvpickve2gr_w(g, t1);                                         \ 
 1000     B = __lasx_xvpickve2gr_w(b, t1);                                         \ 
 1001     A = __lasx_xvpickve2gr_w(a, t1);                                         \ 
 1003         A = av_clip_uint8(A);                                                \ 
 1004     yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\ 
 1008 #define WRITE_FULL(r, g, b, t1, s)                                            \ 
 1010     R = __lasx_xvpickve2gr_w(r, t1);                                          \ 
 1011     G = __lasx_xvpickve2gr_w(g, t1);                                          \ 
 1012     B = __lasx_xvpickve2gr_w(b, t1);                                          \ 
 1013     yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \ 
 1019                              const int16_t **lumSrc, 
int lumFilterSize,
 
 1020                              const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
 1021                              const int16_t **chrVSrc, 
int chrFilterSize,
 
 1022                              const int16_t **alpSrc, uint8_t *dest,
 
 1026     int i, j, 
B, 
G, 
R, 
A;
 
 1030     int a_temp     = 1 << 18;
 
 1032     int tempc      = templ - (128 << 19);
 
 1033     int ytemp      = 1 << 21;
 
 1034     int len        = dstW - 15;
 
 1035     __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
 
 1042     for (
i = 0; 
i < 
len; 
i += 16) {
 
 1043         __m256i l_src, u_src, v_src;
 
 1044         __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, 
temp;
 
 1045         __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
 
 1048         y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
 
 1049         u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
 
 1050         for (j = 0; j < lumFilterSize; j++) {
 
 1051             temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
 
 1052             l_src = __lasx_xvldx(lumSrc[j], n);
 
 1053             y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, 
temp);
 
 1054             y_od  = __lasx_xvmaddwod_w_h(y_od, l_src, 
temp);
 
 1056         for (j = 0; j < chrFilterSize; j++) {
 
 1057             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
 1058             DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
 
 1061                       v_src, 
temp, u_ev, v_ev);
 
 1063                       v_src, 
temp, u_od, v_od);
 
 1065         y_ev = __lasx_xvsrai_w(y_ev, 10);
 
 1066         y_od = __lasx_xvsrai_w(y_od, 10);
 
 1067         u_ev = __lasx_xvsrai_w(u_ev, 10);
 
 1068         u_od = __lasx_xvsrai_w(u_od, 10);
 
 1069         v_ev = __lasx_xvsrai_w(v_ev, 10);
 
 1070         v_od = __lasx_xvsrai_w(v_od, 10);
 
 1072                 y_temp, v2r, v2g, u2g, u2b);
 
 1074                 y_temp, v2r, v2g, u2g, u2b);
 
 1077             __m256i a_src, a_ev, a_od;
 
 1079             a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
 
 1080             for (j = 0; j < lumFilterSize; j++) {
 
 1081                 temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
 
 1082                 a_src = __lasx_xvldx(alpSrc[j], n);
 
 1083                 a_ev  = __lasx_xvmaddwev_w_h(a_ev, a_src, 
temp);
 
 1084                 a_od  = __lasx_xvmaddwod_w_h(a_od, a_src, 
temp);
 
 1086             a_ev = __lasx_xvsrai_w(a_ev, 19);
 
 1087             a_od = __lasx_xvsrai_w(a_od, 19);
 
 1123     if (dstW - 
i >= 8) {
 
 1124         __m256i l_src, u_src, v_src;
 
 1125         __m256i y_ev, u_ev, v_ev, uv, 
temp;
 
 1126         __m256i R_ev, G_ev, B_ev;
 
 1129         y_ev = __lasx_xvreplgr2vr_w(templ);
 
 1130         u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
 
 1131         for (j = 0; j < lumFilterSize; j++) {
 
 1132             temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
 
 1133             l_src = __lasx_xvldx(lumSrc[j], n);
 
 1134             l_src = __lasx_xvpermi_d(l_src, 0xD8);
 
 1135             l_src = __lasx_xvilvl_h(l_src, l_src);
 
 1136             y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, 
temp);
 
 1138         for (j = 0; j < chrFilterSize; j++) {
 
 1139             temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
 
 1140             DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
 
 1141             u_src = __lasx_xvpermi_d(u_src, 0xD8);
 
 1142             v_src = __lasx_xvpermi_d(v_src, 0xD8);
 
 1143             uv    = __lasx_xvilvl_h(v_src, u_src);
 
 1144             u_ev  = __lasx_xvmaddwev_w_h(u_ev, uv, 
temp);
 
 1145             v_ev  = __lasx_xvmaddwod_w_h(v_ev, uv, 
temp);
 
 1147         y_ev = __lasx_xvsrai_w(y_ev, 10);
 
 1148         u_ev = __lasx_xvsrai_w(u_ev, 10);
 
 1149         v_ev = __lasx_xvsrai_w(v_ev, 10);
 
 1151                 y_temp, v2r, v2g, u2g, u2b);
 
 1154             __m256i a_src, a_ev;
 
 1156             a_ev = __lasx_xvreplgr2vr_w(a_temp);
 
 1157             for (j = 0; j < lumFilterSize; j++) {
 
 1158                 temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
 
 1159                 a_src = __lasx_xvldx(alpSrc[j], n);
 
 1160                 a_src = __lasx_xvpermi_d(a_src, 0xD8);
 
 1161                 a_src = __lasx_xvilvl_h(a_src, a_src);
 
 1162                 a_ev  =  __lasx_xvmaddwev_w_h(a_ev, a_src, 
temp);
 
 1164             a_ev = __lasx_xvsrai_w(a_ev, 19);
 
 1185     for (; 
i < dstW; 
i++) {
 
 1187         int V, 
U = 
V = tempc;
 
 1190         for (j = 0; j < lumFilterSize; j++) {
 
 1191             Y += lumSrc[j][
i] * lumFilter[j];
 
 1193         for (j = 0; j < chrFilterSize; j++) {
 
 1194             U += chrUSrc[j][
i] * chrFilter[j];
 
 1195             V += chrVSrc[j][
i] * chrFilter[j];
 
 1203             for (j = 0; j < lumFilterSize; j++) {
 
 1204                 A += alpSrc[j][
i] * lumFilter[j];
 
 1213         R  = (unsigned)
Y + 
V * v2r_coe;
 
 1214         G  = (unsigned)
Y + 
V * v2g_coe + 
U * u2g_coe;
 
 1215         B  = (unsigned)
Y + 
U * u2b_coe;
 
 1216         yuv2rgb_write_full(
c, dest, 
i, 
R, 
A, 
G, 
B, y, target, hasAlpha, err);
 
 1219     c->dither_error[0][
i] = err[0];
 
 1220     c->dither_error[1][
i] = err[1];
 
 1221     c->dither_error[2][
i] = err[2];
 
 1226                              const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1227                              const int16_t *abuf[2], uint8_t *dest, 
int dstW,
 
 1228                              int yalpha, 
int uvalpha, 
int y,
 
 1231     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
 1232                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 
 1233                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
 
 1234                   *abuf0 = hasAlpha ? abuf[0] : 
NULL,
 
 1235                   *abuf1 = hasAlpha ? abuf[1] : 
NULL;
 
 1236     int yalpha1  = 4096 - yalpha;
 
 1237     int uvalpha1 = 4096 - uvalpha;
 
 1238     int uvtemp   = 128 << 19;
 
 1239     int atemp    = 1 << 18;
 
 1241     int ytemp    = 1 << 21;
 
 1242     int len      = dstW - 15;
 
 1246     __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
 
 1247     __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
 
 1248     __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
 
 1249     __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
 
 1250     __m256i uv         = __lasx_xvreplgr2vr_w(uvtemp);
 
 1251     __m256i a_bias     = __lasx_xvreplgr2vr_w(atemp);
 
 1252     __m256i y_temp     = __lasx_xvreplgr2vr_w(ytemp);
 
 1262     for (
i = 0; 
i < 
len; 
i += 16) {
 
 1263         __m256i 
b0, 
b1, ub0, ub1, vb0, vb1;
 
 1264         __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
 
 1265         __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
 
 1266         __m256i y_l, y_h, v_l, v_h, u_l, u_h;
 
 1267         __m256i R_l, R_h, G_l, G_h, B_l, B_h;
 
 1270         DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
 
 1271                   n, ubuf1, n, 
b0, 
b1, ub0, ub1);
 
 1272         DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
 
 1274         DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
 
 1275                   u0_l, u1_l, v0_l, v1_l);
 
 1277         DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
 
 1278                   u0_h, u1_h, v0_h, v1_h);
 
 1279         y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
 
 1280         y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
 
 1281         u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
 
 1282         u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
 
 1283         v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
 
 1284         v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
 
 1285         y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
 
 1286         y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
 
 1287         u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
 
 1288         u_h  = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
 
 1289         v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
 
 1290         v_h  = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
 
 1291         u_l  = __lasx_xvsub_w(u_l, uv);
 
 1292         u_h  = __lasx_xvsub_w(u_h, uv);
 
 1293         v_l  = __lasx_xvsub_w(v_l, uv);
 
 1294         v_h  = __lasx_xvsub_w(v_h, uv);
 
 1295         y_l  = __lasx_xvsrai_w(y_l, 10);
 
 1296         y_h  = __lasx_xvsrai_w(y_h, 10);
 
 1297         u_l  = __lasx_xvsrai_w(u_l, 10);
 
 1298         u_h  = __lasx_xvsrai_w(u_h, 10);
 
 1299         v_l  = __lasx_xvsrai_w(v_l, 10);
 
 1300         v_h  = __lasx_xvsrai_w(v_h, 10);
 
 1302                 y_temp, v2r, v2g, u2g, u2b);
 
 1304                 y_temp, v2r, v2g, u2g, u2b);
 
 1307             __m256i 
a0, 
a1, a0_l, a0_h;
 
 1308             __m256i a_l, a_h, a1_l, a1_h;
 
 1313             a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
 
 1314             a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
 
 1315             a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
 
 1316             a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
 
 1317             a_l = __lasx_xvsrai_w(a_l, 19);
 
 1318             a_h = __lasx_xvsrai_w(a_h, 19);
 
 1354     if (dstW - 
i >= 8) {
 
 1355         __m256i 
b0, 
b1, ub0, ub1, vb0, vb1;
 
 1356         __m256i y0_l, y1_l, u0_l;
 
 1357         __m256i v0_l, u1_l, v1_l;
 
 1358         __m256i y_l, u_l, v_l;
 
 1359         __m256i R_l, G_l, B_l;
 
 1362         DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
 
 1363                   ubuf1, n, 
b0, 
b1, ub0, ub1);
 
 1364         DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
 
 1366         DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
 
 1367                   u0_l, u1_l, v0_l, v1_l);
 
 1368         y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
 
 1369         u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
 
 1370         v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
 
 1371         y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
 
 1372         u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
 
 1373         v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
 
 1374         u_l  = __lasx_xvsub_w(u_l, uv);
 
 1375         v_l  = __lasx_xvsub_w(v_l, uv);
 
 1376         y_l  = __lasx_xvsrai_w(y_l, 10);
 
 1377         u_l  = __lasx_xvsrai_w(u_l, 10);
 
 1378         v_l  = __lasx_xvsrai_w(v_l, 10);
 
 1380                 y_temp, v2r, v2g, u2g, u2b);
 
 1383             __m256i 
a0, 
a1, a0_l;
 
 1388             a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
 
 1389             a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
 
 1390             a_l = __lasx_xvsrai_w(a_l, 19);
 
 1411     for (; 
i < dstW; 
i++){
 
 1412         int Y = ( buf0[
i] * yalpha1  +  buf1[
i] * yalpha         ) >> 10;
 
 1413         int U = (ubuf0[
i] * uvalpha1 + ubuf1[
i] * uvalpha- uvtemp) >> 10;
 
 1414         int V = (vbuf0[
i] * uvalpha1 + vbuf1[
i] * uvalpha- uvtemp) >> 10;
 
 1418             A = (abuf0[
i] * yalpha1 + abuf1[
i] * yalpha + atemp) >> 19;
 
 1426         R  = (unsigned)
Y + 
V * v2r_coe;
 
 1427         G  = (unsigned)
Y + 
V * v2g_coe + 
U * u2g_coe;
 
 1428         B  = (unsigned)
Y + 
U * u2b_coe;
 
 1429         yuv2rgb_write_full(
c, dest, 
i, 
R, 
A, 
G, 
B, y, target, hasAlpha, err);
 
 1432     c->dither_error[0][
i] = err[0];
 
 1433     c->dither_error[1][
i] = err[1];
 
 1434     c->dither_error[2][
i] = err[2];
 
 1439                              const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1440                              const int16_t *abuf0, uint8_t *dest, 
int dstW,
 
 1444     const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
 
 1448     int ytemp      = 1 << 21;
 
 1450     int len        = dstW - 15;
 
 1451     __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
 
 1457     if (uvalpha < 2048) {
 
 1458         int uvtemp   = 128 << 7;
 
 1459         __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
 
 1460         __m256i 
bias = __lasx_xvreplgr2vr_w(bias_int);
 
 1462         for (
i = 0; 
i < 
len; 
i += 16) {
 
 1463             __m256i 
b, 
ub, vb, ub_l, ub_h, vb_l, vb_h;
 
 1464             __m256i y_l, y_h, u_l, u_h, v_l, v_h;
 
 1465             __m256i R_l, R_h, G_l, G_h, B_l, B_h;
 
 1469             vb  = __lasx_xvldx(vbuf0, n);
 
 1470             y_l = __lasx_xvsllwil_w_h(
b, 2);
 
 1471             y_h = __lasx_xvexth_w_h(
b);
 
 1472             DUP2_ARG2(__lasx_xvsllwil_w_h, 
ub, 0, vb, 0, ub_l, vb_l);
 
 1474             y_h = __lasx_xvslli_w(y_h, 2);
 
 1475             u_l = __lasx_xvsub_w(ub_l, uv);
 
 1476             u_h = __lasx_xvsub_w(ub_h, uv);
 
 1477             v_l = __lasx_xvsub_w(vb_l, uv);
 
 1478             v_h = __lasx_xvsub_w(vb_h, uv);
 
 1479             u_l = __lasx_xvslli_w(u_l, 2);
 
 1480             u_h = __lasx_xvslli_w(u_h, 2);
 
 1481             v_l = __lasx_xvslli_w(v_l, 2);
 
 1482             v_h = __lasx_xvslli_w(v_h, 2);
 
 1484                     y_temp, v2r, v2g, u2g, u2b);
 
 1486                     y_temp, v2r, v2g, u2g, u2b);
 
 1492                 a_src = __lasx_xvld(abuf0 + 
i, 0);
 
 1493                 a_l   = __lasx_xvsllwil_w_h(a_src, 0);
 
 1494                 a_h   = __lasx_xvexth_w_h(a_src);
 
 1495                 a_l   = __lasx_xvadd_w(a_l, 
bias);
 
 1496                 a_h   = __lasx_xvadd_w(a_h, 
bias);
 
 1497                 a_l   = __lasx_xvsrai_w(a_l, 7);
 
 1498                 a_h   = __lasx_xvsrai_w(a_h, 7);
 
 1534         if (dstW - 
i >= 8) {
 
 1535             __m256i 
b, 
ub, vb, ub_l, vb_l;
 
 1536             __m256i y_l, u_l, v_l;
 
 1537             __m256i R_l, G_l, B_l;
 
 1541             vb  = __lasx_xvldx(vbuf0, n);
 
 1542             y_l = __lasx_vext2xv_w_h(
b);
 
 1543             DUP2_ARG1(__lasx_vext2xv_w_h, 
ub, vb, ub_l, vb_l);
 
 1544             y_l = __lasx_xvslli_w(y_l, 2);
 
 1545             u_l = __lasx_xvsub_w(ub_l, uv);
 
 1546             v_l = __lasx_xvsub_w(vb_l, uv);
 
 1547             u_l = __lasx_xvslli_w(u_l, 2);
 
 1548             v_l = __lasx_xvslli_w(v_l, 2);
 
 1550                     y_temp, v2r, v2g, u2g, u2b);
 
 1555                 a_src = __lasx_xvldx(abuf0, n);
 
 1556                 a_src = __lasx_vext2xv_w_h(a_src);
 
 1557                 a_l   = __lasx_xvadd_w(
bias, a_src);
 
 1558                 a_l   = __lasx_xvsrai_w(a_l, 7);
 
 1579         for (; 
i < dstW; 
i++) {
 
 1580             int Y = buf0[
i] << 2;
 
 1581             int U = (ubuf0[
i] - uvtemp) << 2;
 
 1582             int V = (vbuf0[
i] - uvtemp) << 2;
 
 1586                 A = (abuf0[
i] + 64) >> 7;
 
 1593             R  = (unsigned)
Y + 
V * v2r_coe;
 
 1594             G  = (unsigned)
Y + 
V * v2g_coe + 
U * u2g_coe;
 
 1595             B  = (unsigned)
Y + 
U * u2b_coe;
 
 1596             yuv2rgb_write_full(
c, dest, 
i, 
R, 
A, 
G, 
B, y, target, hasAlpha, err);
 
 1600         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
 
 1601         int uvtemp   = 128 << 8;
 
 1602         __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
 
 1603         __m256i 
zero = __lasx_xvldi(0);
 
 1604         __m256i 
bias = __lasx_xvreplgr2vr_h(bias_int);
 
 1606         for (
i = 0; 
i < 
len; 
i += 16) {
 
 1607             __m256i 
b, ub0, ub1, vb0, vb1;
 
 1608             __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
 
 1609             __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
 
 1612             DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
 1613                       ubuf1, n, b, ub0, vb0, ub1);
                  /* Second V row comes from vbuf1. vbuf is the two-entry
                   * pointer array parameter, so loading through it would read
                   * pointer bytes (and beyond) instead of samples. */
 1614             vb1 = __lasx_xvldx(vbuf1, n);
                  /* Widen even/odd luma halfwords to words (adding zero). */
 1615             y_ev = __lasx_xvaddwev_w_h(b, zero);
 1616             y_od = __lasx_xvaddwod_w_h(b, zero);
                  /* DUP2_ARG2 applies the op to (in0, in1) -> out0 and
                   * (in2, in3) -> out1, so the operands must be ordered
                   * ub0, ub1, vb0, vb1 to get U = ubuf0 + ubuf1 and
                   * V = vbuf0 + vbuf1, matching the 8-pixel path and the
                   * scalar tail of this branch. */
 1617             DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
 1618             DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
 
 1619             DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
 
 1620             DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
 
 1621                       u_ev, u_od, v_ev, v_od);
 
 1622             DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
 
 1623                       u_ev, u_od, v_ev, v_od);
 
 1625                     y_temp, v2r, v2g, u2g, u2b);
 
 1627                     y_temp, v2r, v2g, u2g, u2b);
 
 1633                 a_src = __lasx_xvld(abuf0 + 
i, 0);
 
 1634                 a_ev  = __lasx_xvaddwev_w_h(
bias, a_src);
 
 1635                 a_od  = __lasx_xvaddwod_w_h(
bias, a_src);
 
 1636                 a_ev  = __lasx_xvsrai_w(a_ev, 7);
 
 1637                 a_od  = __lasx_xvsrai_w(a_od, 7);
 
 1673         if (dstW - 
i >= 8) {
 
 1674             __m256i 
b, ub0, ub1, vb0, vb1;
 
 1675             __m256i y_l, u_l, v_l;
 
 1676             __m256i R_l, G_l, B_l;
 
 1679             DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
 
 1680                       ubuf1, n, 
b, ub0, vb0, ub1);
 
 1681             vb1 = __lasx_xvldx(vbuf1, n);
 
 1682             y_l = __lasx_vext2xv_w_h(
b);
 
 1683             y_l = __lasx_xvslli_w(y_l, 2);
 
 1684             DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
 
 1685                       ub0, vb0, ub1, vb1);
 
 1686             DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
 
 1687             u_l = __lasx_xvsub_w(u_l, uv);
 
 1688             v_l = __lasx_xvsub_w(v_l, uv);
 
 1689             u_l = __lasx_xvslli_w(u_l, 1);
 
 1690             v_l = __lasx_xvslli_w(v_l, 1);
 
 1692                     y_temp, v2r, v2g, u2g, u2b);
 
 1698                 a_src  = __lasx_xvld(abuf0 + 
i, 0);
 
 1699                 a_src  = __lasx_xvpermi_d(a_src, 0xD8);
 
 1700                 a_src  = __lasx_xvilvl_h(a_src, a_src);
 
 1701                 a_l    = __lasx_xvaddwev_w_h(
bias, a_src);
 
 1702                 a_l   = __lasx_xvsrai_w(a_l, 7);
 
 1723         for (; 
i < dstW; 
i++) {
 
 1724             int Y = buf0[
i] << 2;
 
 1725             int U = (ubuf0[
i] + ubuf1[
i] - uvtemp) << 1;
 
 1726             int V = (vbuf0[
i] + vbuf1[
i] - uvtemp) << 1;
 
 1730                 A = (abuf0[
i] + 64) >> 7;
 
 1737             R  = (unsigned)
Y + 
V * v2r_coe;
 
 1738             G  = (unsigned)
Y + 
V * v2g_coe + 
U * u2g_coe;
 
 1739             B  = (unsigned)
Y + 
U * u2b_coe;
 
 1740             yuv2rgb_write_full(
c, dest, 
i, 
R, 
A, 
G, 
B, y, target, hasAlpha, err);
 
 1744     c->dither_error[0][
i] = err[0];
 
 1745     c->dither_error[1][
i] = err[1];
 
 1746     c->dither_error[2][
i] = err[2];
 
 1750                CONFIG_SWSCALE_ALPHA && 
c->needAlpha)
 
 1752                CONFIG_SWSCALE_ALPHA && 
c->needAlpha)
 
 1754                CONFIG_SWSCALE_ALPHA && 
c->needAlpha)
 
 1756                CONFIG_SWSCALE_ALPHA && 
c->needAlpha)
 
 1758 #if CONFIG_SWSCALE_ALPHA 
 1782         switch (
c->dstFormat) {
 
 1785             c->yuv2packedX = yuv2rgba32_full_X_lasx;
 
 1786             c->yuv2packed2 = yuv2rgba32_full_2_lasx;
 
 1787             c->yuv2packed1 = yuv2rgba32_full_1_lasx;
 
 1789 #if CONFIG_SWSCALE_ALPHA 
 1791                 c->yuv2packedX = yuv2rgba32_full_X_lasx;
 
 1792                 c->yuv2packed2 = yuv2rgba32_full_2_lasx;
 
 1793                 c->yuv2packed1 = yuv2rgba32_full_1_lasx;
 
 1797                 c->yuv2packedX = yuv2rgbx32_full_X_lasx;
 
 1798                 c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
 
 1799                 c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
 
 1805             c->yuv2packedX = yuv2argb32_full_X_lasx;
 
 1806             c->yuv2packed2 = yuv2argb32_full_2_lasx;
 
 1807             c->yuv2packed1 = yuv2argb32_full_1_lasx;
 
 1809 #if CONFIG_SWSCALE_ALPHA 
 1811                 c->yuv2packedX = yuv2argb32_full_X_lasx;
 
 1812                 c->yuv2packed2 = yuv2argb32_full_2_lasx;
 
 1813                 c->yuv2packed1 = yuv2argb32_full_1_lasx;
 
 1817                 c->yuv2packedX = yuv2xrgb32_full_X_lasx;
 
 1818                 c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
 
 1819                 c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
 
 1825             c->yuv2packedX = yuv2bgra32_full_X_lasx;
 
 1826             c->yuv2packed2 = yuv2bgra32_full_2_lasx;
 
 1827             c->yuv2packed1 = yuv2bgra32_full_1_lasx;
 
 1829 #if CONFIG_SWSCALE_ALPHA 
 1831                 c->yuv2packedX = yuv2bgra32_full_X_lasx;
 
 1832                 c->yuv2packed2 = yuv2bgra32_full_2_lasx;
 
 1833                 c->yuv2packed1 = yuv2bgra32_full_1_lasx;
 
 1837                 c->yuv2packedX = yuv2bgrx32_full_X_lasx;
 
 1838                 c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
 
 1839                 c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
 
 1845             c->yuv2packedX = yuv2abgr32_full_X_lasx;
 
 1846             c->yuv2packed2 = yuv2abgr32_full_2_lasx;
 
 1847             c->yuv2packed1 = yuv2abgr32_full_1_lasx;
 
 1849 #if CONFIG_SWSCALE_ALPHA 
 1851                 c->yuv2packedX = yuv2abgr32_full_X_lasx;
 
 1852                 c->yuv2packed2 = yuv2abgr32_full_2_lasx;
 
 1853                 c->yuv2packed1 = yuv2abgr32_full_1_lasx;
 
 1857                 c->yuv2packedX = yuv2xbgr32_full_X_lasx;
 
 1858                 c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
 
 1859                 c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
 
 1864             c->yuv2packedX = yuv2rgb24_full_X_lasx;
 
 1865             c->yuv2packed2 = yuv2rgb24_full_2_lasx;
 
 1866             c->yuv2packed1 = yuv2rgb24_full_1_lasx;
 
 1869             c->yuv2packedX = yuv2bgr24_full_X_lasx;
 
 1870             c->yuv2packed2 = yuv2bgr24_full_2_lasx;
 
 1871             c->yuv2packed1 = yuv2bgr24_full_1_lasx;
 
 1874             c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
 
 1875             c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
 
 1876             c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
 
 1879             c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
 
 1880             c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
 
 1881             c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
 
 1884             c->yuv2packedX = yuv2bgr8_full_X_lasx;
 
 1885             c->yuv2packed2 = yuv2bgr8_full_2_lasx;
 
 1886             c->yuv2packed1 = yuv2bgr8_full_1_lasx;
 
 1889             c->yuv2packedX = yuv2rgb8_full_X_lasx;
 
 1890             c->yuv2packed2 = yuv2rgb8_full_2_lasx;
 
 1891             c->yuv2packed1 = yuv2rgb8_full_1_lasx;
 
 1895         switch (
c->dstFormat) {
 
 1900 #if CONFIG_SWSCALE_ALPHA 
 1905                 c->yuv2packed1 = yuv2rgbx32_1_lasx;
 
 1906                 c->yuv2packed2 = yuv2rgbx32_2_lasx;
 
 1907                 c->yuv2packedX = yuv2rgbx32_X_lasx;
 
 1915 #if CONFIG_SWSCALE_ALPHA 
 1920                 c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
 
 1921                 c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
 
 1922                 c->yuv2packedX = yuv2rgbx32_1_X_lasx;
 
 1927             c->yuv2packed1 = yuv2rgb24_1_lasx;
 
 1928             c->yuv2packed2 = yuv2rgb24_2_lasx;
 
 1929             c->yuv2packedX = yuv2rgb24_X_lasx;
 
 1932             c->yuv2packed1 = yuv2bgr24_1_lasx;
 
 1933             c->yuv2packed2 = yuv2bgr24_2_lasx;
 
 1934             c->yuv2packedX = yuv2bgr24_X_lasx;
 
 1940             c->yuv2packed1 = yuv2rgb16_1_lasx;
 
 1941             c->yuv2packed2 = yuv2rgb16_2_lasx;
 
 1942             c->yuv2packedX = yuv2rgb16_X_lasx;
 
 1948             c->yuv2packed1 = yuv2rgb15_1_lasx;
 
 1949             c->yuv2packed2 = yuv2rgb15_2_lasx;
 
 1950             c->yuv2packedX = yuv2rgb15_X_lasx;
 
 1956             c->yuv2packed1 = yuv2rgb12_1_lasx;
 
 1957             c->yuv2packed2 = yuv2rgb12_2_lasx;
 
 1958             c->yuv2packedX = yuv2rgb12_X_lasx;
 
 1962             c->yuv2packed1 = yuv2rgb8_1_lasx;
 
 1963             c->yuv2packed2 = yuv2rgb8_2_lasx;
 
 1964             c->yuv2packedX = yuv2rgb8_X_lasx;
 
 1968             c->yuv2packed1 = yuv2rgb4_1_lasx;
 
 1969             c->yuv2packed2 = yuv2rgb4_2_lasx;
 
 1970             c->yuv2packedX = yuv2rgb4_X_lasx;
 
 1974             c->yuv2packed1 = yuv2rgb4b_1_lasx;
 
 1975             c->yuv2packed2 = yuv2rgb4b_2_lasx;
 
 1976             c->yuv2packedX = yuv2rgb4b_X_lasx;