    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
 
    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
 
    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
 
    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
 
    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
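
/* One column pass of the 4x4 inverse DCT: the even part uses the 64/64
 * coefficients, the odd part 83/36; results are rounded by 'shift' and
 * saturated to the 16-bit range. */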
 
#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
                         sum0, sum1, sum2, sum3, shift)       \
{                                                             \
    __m128i vec0, vec1, vec2, vec3, vec4, vec5;               \
    __m128i cnst64 = __lsx_vldi(0x0840);                      \
    __m128i cnst83 = __lsx_vldi(0x0853);                      \
    __m128i cnst36 = __lsx_vldi(0x0824);                      \
                                                              \
    vec0 = __lsx_vdp2_w_h(in_r0, cnst64);                     \
    vec1 = __lsx_vdp2_w_h(in_l0, cnst83);                     \
    vec2 = __lsx_vdp2_w_h(in_r1, cnst64);                     \
    vec3 = __lsx_vdp2_w_h(in_l1, cnst36);                     \
    vec4 = __lsx_vdp2_w_h(in_l0, cnst36);                     \
    vec5 = __lsx_vdp2_w_h(in_l1, cnst83);                     \
                                                              \
    sum0 = __lsx_vadd_w(vec0, vec2);                          \
    sum1 = __lsx_vsub_w(vec0, vec2);                          \
    vec1 = __lsx_vadd_w(vec1, vec3);                          \
    vec4 = __lsx_vsub_w(vec4, vec5);                          \
    sum2 = __lsx_vsub_w(sum1, vec4);                          \
    sum3 = __lsx_vsub_w(sum0, vec1);                          \
    sum0 = __lsx_vadd_w(sum0, vec1);                          \
    sum1 = __lsx_vadd_w(sum1, vec4);                          \
                                                              \
    sum0 = __lsx_vsrari_w(sum0, shift);                       \
    sum1 = __lsx_vsrari_w(sum1, shift);                       \
    sum2 = __lsx_vsrari_w(sum2, shift);                       \
    sum3 = __lsx_vsrari_w(sum3, shift);                       \
    sum0 = __lsx_vsat_w(sum0, 15);                            \
    sum1 = __lsx_vsat_w(sum1, 15);                            \
    sum2 = __lsx_vsat_w(sum2, 15);                            \
    sum3 = __lsx_vsat_w(sum3, 15);                            \
}
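
/* One column pass of the 8x8 inverse DCT.  'filter' points at the 8x8
 * coefficient table; the even half is combined with a butterfly, the odd
 * half uses the 89/75/50/18 products, and each output row is narrowed
 * back to 16 bits with a rounding shift. */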
#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                        \
    __m128i src0_r, src1_r, src2_r, src3_r;                              \
    __m128i src0_l, src1_l, src2_l, src3_l;                              \
    __m128i filter0, filter1, filter2, filter3;                          \
    __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;        \
    __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;        \
    __m128i sum0_r, sum1_r, sum2_r, sum3_r;                              \
    __m128i sum0_l, sum1_l, sum2_l, sum3_l;                              \
                                                                         \
    DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
              src0_r, src1_r, src2_r, src3_r);                           \
    DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
              src0_l, src1_l, src2_l, src3_l);                           \
                                                                         \
    DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8,          \
              filter, 12, filter0, filter1, filter2, filter3);           \
    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
              temp1_r, temp1_l);                                         \
                                                                         \
    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
                      sum1_l, sum1_r);                                   \
    sum2_r = sum1_r;                                                     \
    sum2_l = sum1_l;                                                     \
    sum3_r = sum0_r;                                                     \
    sum3_l = sum0_l;                                                     \
                                                                         \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
              temp3_r, temp3_l);                                         \
    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
                                                                         \
    in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
    in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
                                                                         \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
              temp5_r, temp5_l);                                         \
    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
    sum1_r  = __lsx_vadd_w(sum1_r, temp4_r);                             \
    sum1_l  = __lsx_vadd_w(sum1_l, temp4_l);                             \
    sum2_r  = __lsx_vsub_w(sum2_r, temp4_r);                             \
    sum2_l  = __lsx_vsub_w(sum2_l, temp4_l);                             \
                                                                         \
    in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
    in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
                                                                         \
    DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24,       \
              filter, 28, filter0, filter1, filter2, filter3);           \
    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
              temp1_r, temp1_l);                                         \
                                                                         \
    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
                      sum1_l, sum1_r);                                   \
    sum2_r = sum1_r;                                                     \
    sum2_l = sum1_l;                                                     \
    sum3_r = sum0_r;                                                     \
    sum3_l = sum0_l;                                                     \
                                                                         \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
              temp3_r, temp3_l);                                         \
    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
                                                                         \
    in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
    in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
                                                                         \
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
              temp5_r, temp5_l);                                         \
    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
    sum1_r  = __lsx_vsub_w(sum1_r, temp4_r);                             \
    sum1_l  = __lsx_vsub_w(sum1_l, temp4_l);                             \
    sum2_r  = __lsx_vadd_w(sum2_r, temp4_r);                             \
    sum2_l  = __lsx_vadd_w(sum2_l, temp4_l);                             \
                                                                         \
    in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
    in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
}
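
/* One pass over eight columns of the 16x16 inverse DCT.  'buf_ptr',
 * 'filter', 'j' and 'k' come from the enclosing function; each loop
 * iteration produces four output rows, stored symmetrically around the
 * middle and the ends of the 16-row block. */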
#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                   \
                           src4_r, src5_r, src6_r, src7_r,                   \
                           src0_l, src1_l, src2_l, src3_l,                   \
                           src4_l, src5_l, src6_l, src7_l, shift)            \
{                                                                            \
    int16_t *ptr0, *ptr1;                                                    \
    __m128i dst0, dst1;                                                      \
    __m128i filter0, filter1, filter2, filter3;                              \
    __m128i temp0_r, temp1_r, temp0_l, temp1_l;                              \
    __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;          \
    __m128i sum3_l, res0_r, res1_r, res0_l, res1_l;                          \
                                                                             \
    ptr0 = (buf_ptr + 112);                                                  \
    ptr1 = (buf_ptr + 128);                                                  \
    k = -1;                                                                  \
                                                                             \
    for (j = 0; j < 4; j++)                                                  \
    {                                                                        \
        DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16,         \
                  filter, 20, filter0, filter1, filter2, filter3);           \
        DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
                  src4_r, filter2, src4_l, filter2,  sum0_r, sum0_l,         \
                  sum2_r, sum2_l);                                           \
        DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2,          \
                  sum3_r, sum3_l);                                           \
        DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l,        \
                  src1_l, filter1, sum2_r, src5_r, filter3, sum2_l,          \
                  src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l);          \
        DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l,        \
                  src6_l, filter3, sum3_r, sum3_l);                          \
                                                                             \
        sum1_r = sum0_r;                                                     \
        sum1_l = sum0_l;                                                     \
                                                                             \
        DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24,        \
                  filter, 28, filter0, filter1, filter2, filter3);           \
        DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,          \
                  temp0_r, temp0_l);                                         \
        DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l,        \
                  src6_l, filter2, sum2_r, sum2_l);                          \
        DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2,          \
                  temp1_r, temp1_l);                                         \
                                                                             \
        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
        sum3_r = __lsx_vsub_w(temp1_r, sum3_r);                              \
        sum3_l = __lsx_vsub_w(temp1_l, sum3_l);                              \
                                                                             \
        DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1,          \
                  temp0_r, temp0_l);                                         \
        DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l,        \
                  src7_l, filter3, sum3_r, src4_r, filter3, sum3_l,          \
                  src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l);          \
                                                                             \
        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
                                                                             \
        LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,    \
                          res1_l, res1_r);                                   \
        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
        __lsx_vst(dst0, buf_ptr, 0);                                         \
        __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0);               \
                                                                             \
        LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,    \
                          res1_l, res1_r);                                   \
        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
        __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
        __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
                                                                             \
        k *= -1;                                                             \
        buf_ptr += 16;                                                       \
        filter += 16;                                                        \
    }                                                                        \
}
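
/* Butterfly step for the even half of the 16-point transform: the row at
 * 'load_idx' of 'input' is updated with the sum and the row at 'store_idx'
 * receives the difference.  tmp0_* and tmp1_* come from the caller. */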
#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
{                                                                     \
    tmp0_r = __lsx_vld(input + load_idx * 8, 0);                      \
    tmp0_l = __lsx_vld(input + load_idx * 8, 16);                     \
    tmp1_r = sum0_r;                                                  \
    tmp1_l = sum0_l;                                                  \
    sum0_r = __lsx_vadd_w(sum0_r, tmp0_r);                            \
    sum0_l = __lsx_vadd_w(sum0_l, tmp0_l);                            \
    __lsx_vst(sum0_r, (input + load_idx * 8), 0);                     \
    __lsx_vst(sum0_l, (input + load_idx * 8), 16);                    \
    tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r);                            \
    tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l);                            \
    __lsx_vst(tmp1_r, (input + store_idx * 8), 0);                    \
    __lsx_vst(tmp1_l, (input + store_idx * 8), 16);                   \
}
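
/* One column pass of the 4x4 DST-VII used for intra luma blocks, built
 * from the 29/55/74 coefficients (84 is folded as 29 + 55), with rounding
 * and saturation at the end. */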
#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
                              res0, res1, res2, res3, shift)  \
{                                                             \
    __m128i vec0, vec1, vec2, vec3;                           \
    __m128i cnst74 = __lsx_vldi(0x84a);                       \
    __m128i cnst55 = __lsx_vldi(0x837);                       \
    __m128i cnst29 = __lsx_vldi(0x81d);                       \
                                                              \
    vec0 = __lsx_vadd_w(in_r0, in_r1);                        \
    vec2 = __lsx_vsub_w(in_r0, in_l1);                        \
    res0 = __lsx_vmul_w(vec0, cnst29);                        \
    res1 = __lsx_vmul_w(vec2, cnst55);                        \
    res2 = __lsx_vsub_w(in_r0, in_r1);                        \
    vec1 = __lsx_vadd_w(in_r1, in_l1);                        \
    res2 = __lsx_vadd_w(res2, in_l1);                         \
    vec3 = __lsx_vmul_w(in_l0, cnst74);                       \
    res3 = __lsx_vmul_w(vec0, cnst55);                        \
                                                              \
    res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55));    \
    res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29));    \
    res2 = __lsx_vmul_w(res2, cnst74);                        \
    res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29));    \
                                                              \
    res0 = __lsx_vadd_w(res0, vec3);                          \
    res1 = __lsx_vadd_w(res1, vec3);                          \
    res3 = __lsx_vsub_w(res3, vec3);                          \
                                                              \
    res0 = __lsx_vsrari_w(res0, shift);                       \
    res1 = __lsx_vsrari_w(res1, shift);                       \
    res2 = __lsx_vsrari_w(res2, shift);                       \
    res3 = __lsx_vsrari_w(res3, shift);                       \
    res0 = __lsx_vsat_w(res0, 15);                            \
    res1 = __lsx_vsat_w(res1, 15);                            \
    res2 = __lsx_vsat_w(res2, 15);                            \
    res3 = __lsx_vsat_w(res3, 15);                            \
}
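
/* 4x4 inverse DCT: unpack the two coefficient rows to 32 bits, run the
 * column transform with shift 7, transpose, run it again with shift 12,
 * then repack and store the transposed result. */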
    __m128i in0, in1;
    __m128i in_r0, in_l0, in_r1, in_l1;
    __m128i sum0, sum1, sum2, sum3;
    __m128i zero = __lsx_vldi(0x00);

    in0   = __lsx_vld(coeffs, 0);
    in1   = __lsx_vld(coeffs, 16);
    in_r0 = __lsx_vilvl_h(zero, in0);
    in_l0 = __lsx_vilvh_h(zero, in0);
    in_r1 = __lsx_vilvl_h(zero, in1);
    in_l1 = __lsx_vilvh_h(zero, in1);

    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
    LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);

    in0  = __lsx_vpickev_h(sum2, sum0);
    in1  = __lsx_vpickev_h(sum3, sum1);
    sum0 = __lsx_vilvl_h(in1, in0);
    sum1 = __lsx_vilvh_h(in1, in0);
    in0  = __lsx_vilvl_w(sum1, sum0);
    in1  = __lsx_vilvh_w(sum1, sum0);

    __lsx_vst(in0, coeffs, 0);
    __lsx_vst(in1, coeffs, 16);
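
/* 8x8 inverse DCT: column transform (shift 7), transpose, column
 * transform again (shift 12), transpose, and store back to 'coeffs'. */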
 
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
              coeffs, 48, in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
              coeffs, 112, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    __lsx_vst(in0, coeffs, 0);
    __lsx_vst(in1, coeffs, 16);
    __lsx_vst(in2, coeffs, 32);
    __lsx_vst(in3, coeffs, 48);
    __lsx_vst(in4, coeffs, 64);
    __lsx_vst(in5, coeffs, 80);
    __lsx_vst(in6, coeffs, 96);
    __lsx_vst(in7, coeffs, 112);
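
/* 16x16 inverse DCT: each pass works on two slices of eight columns,
 * buffering the first pass in 'buf' and using 8x8 transposes to switch
 * between row and column order. */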
 
    int16_t *buf_ptr = &buf[0];
    int16_t *src = coeffs;

    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i in8, in9, in10, in11, in12, in13, in14, in15;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;

                  in8, in9, in10, in11);

                  in12, in13, in14, in15);

        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_r, src1_r, src2_r, src3_r);
        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_r, src5_r, src6_r, src7_r);
        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_l, src5_l, src6_l, src7_l);
 
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 7);
 
        buf_ptr = (&buf[0] + 8);

                  in2, in10, in3, in11);

                  in4, in12, in5, in13);

                  in6, in14, in7, in15);
        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
                           in8, in9, in10, in11, in12, in13, in14, in15);
        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_r, src1_r, src2_r, src3_r);
        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_r, src5_r, src6_r, src7_r);
        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
                  src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
                  src4_l, src5_l, src6_l, src7_l);
 
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 12);
 
        buf_ptr = coeffs + 8;

    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
              in4, in5, in6, in7);
    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    __lsx_vst(vec0, coeffs, 0);
    __lsx_vst(vec1, coeffs, 32);
    __lsx_vst(vec2, coeffs, 64);
    __lsx_vst(vec3, coeffs, 96);
    __lsx_vst(vec4, coeffs, 128);
    __lsx_vst(vec5, coeffs, 160);
    __lsx_vst(vec6, coeffs, 192);
    __lsx_vst(vec7, coeffs, 224);
 
    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);

    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);

              in8, in9, in10, in11);

              in12, in13, in14, in15);

    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);
    LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);

    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);

    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    __lsx_vst(vec0, src, 0);
    __lsx_vst(vec1, src, 32);
    __lsx_vst(vec2, src, 64);
    __lsx_vst(vec3, src, 96);
    __lsx_vst(vec4, src, 128);
    __lsx_vst(vec5, src, 160);
    __lsx_vst(vec6, src, 192);
    __lsx_vst(vec7, src, 224);
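
/* One pass over eight columns of the 32-point inverse transform.  The even
 * half is accumulated into an aligned temporary buffer of sixteen
 * partial-sum rows; the odd-row products are then added and subtracted,
 * rounded by 'round' and written back to 'coeffs'. */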
 
    int32_t buf_pitch_2  = buf_pitch << 1;
    int32_t buf_pitch_4  = buf_pitch << 2;
    int32_t buf_pitch_8  = buf_pitch << 3;
    int32_t buf_pitch_16 = buf_pitch << 4;

    int16_t *src0 = (coeffs + buf_pitch);
    int16_t *src1 = (coeffs + buf_pitch_2);
    int16_t *src2 = (coeffs + buf_pitch_4);
    int16_t *src3 = (coeffs);

    int32_t *tmp_buf_ptr = tmp_buf + 15;
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
    __m128i filter0, filter1, filter2, filter3;
    __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;

    /* Align the temporary buffer to a 64-byte boundary. */
    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
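
    /* Even half: load rows 4, 12, 20, 28 and 0, 8, 16, 24, and store the
     * partial sums of the 8-point sub-transform into tmp_buf. */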
 
    in0 = __lsx_vld(src2, 0);
    in1 = __lsx_vld(src2 + buf_pitch_8, 0);
    in2 = __lsx_vld(src2 + buf_pitch_16, 0);
    in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
    in4 = __lsx_vld(src3, 0);
    in5 = __lsx_vld(src3 + buf_pitch_8, 0);
    in6 = __lsx_vld(src3 + buf_pitch_16, 0);
    in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
              src0_l, src1_l, src2_l, src3_l);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 0);
    __lsx_vst(sum0_l, tmp_buf_ptr, 16);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 32);
    __lsx_vst(sum0_l, tmp_buf_ptr, 48);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 64);
    __lsx_vst(sum0_l, tmp_buf_ptr, 80);

    filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
    filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
    __lsx_vst(sum0_r, tmp_buf_ptr, 96);
    __lsx_vst(sum0_l, tmp_buf_ptr, 112);
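
    /* Rows 0, 8, 16, 24: the 4-point even part, folded into the partial
     * sums already held in tmp_buf. */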
 
    filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
    filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);

    filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
    filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
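
    /* Rows 2, 6, 10, ..., 30: the odd part of the 16-point even half,
     * added to and subtracted from the partial sums in tmp_buf. */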
 
    in0 = __lsx_vld(src1, 0);
    in1 = __lsx_vld(src1 + buf_pitch_4, 0);
    in2 = __lsx_vld(src1 + buf_pitch_8, 0);
    in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
    in4 = __lsx_vld(src1 + buf_pitch_16, 0);
    in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
    in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
    in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);

    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_l, src1_l, src2_l, src3_l);

    for (i = 0; i < 8; i++) {
        filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
        filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
        filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
        filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);

        tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
        tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
        __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
        __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
        __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
        __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
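
    /* Rows 1, 3, 5, ..., 31: the odd half of the 32-point transform.  Each
     * iteration combines the odd-row products with one even-half row from
     * tmp_buf, rounds by 'round' and stores rows i and 31 - i of 'coeffs'. */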
 
    in0 = __lsx_vld(src0, 0);
    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);

    src0 += 16 * buf_pitch;
    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_r, src1_r, src2_r, src3_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src0_l, src1_l, src2_l, src3_l);
    in0 = __lsx_vld(src0, 0);
    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);

    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src4_r, src5_r, src6_r, src7_r);
    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
              src4_l, src5_l, src6_l, src7_l);

    for (i = 0; i < 16; i++) {
        filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
        filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
        filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
        filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
        tmp1_r = sum0_r;
        tmp1_l = sum0_l;

        filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
        filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
        filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
        filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
        sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
        sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
        sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
        sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
        sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
        sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);

        tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
        tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
        sum1_r = __lsx_vreplgr2vr_w(round);
        tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
        tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
        in0    = __lsx_vpackev_d(tmp0_l, tmp0_r);
        __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
        tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
        tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
        in0    = __lsx_vpackev_d(tmp1_l, tmp1_r);
        __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
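
/* Transpose a 32x8 block of coefficients into an 8x32 temporary buffer,
 * one 8x8 tile at a time. */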
 
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
                  coeffs, 192, in0, in1, in2, in3);
        DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
                  coeffs, 448, in4, in5, in6, in7);

        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        __lsx_vst(in0, tmp_buf, 0);
        __lsx_vst(in1, tmp_buf, 16);
        __lsx_vst(in2, tmp_buf, 32);
        __lsx_vst(in3, tmp_buf, 48);
        __lsx_vst(in4, tmp_buf, 64);
        __lsx_vst(in5, tmp_buf, 80);
        __lsx_vst(in6, tmp_buf, 96);
        __lsx_vst(in7, tmp_buf, 112);
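
/* Transpose an 8x32 temporary buffer back into a 32x8 block of
 * coefficients, one 8x8 tile at a time. */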
 
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
                  tmp_buf, 48, in0, in1, in2, in3);
        DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
                  tmp_buf, 112, in4, in5, in6, in7);

        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        __lsx_vst(in0, coeffs, 0);
        __lsx_vst(in1, coeffs, 64);
        __lsx_vst(in2, coeffs, 128);
        __lsx_vst(in3, coeffs, 192);
        __lsx_vst(in4, coeffs, 256);
        __lsx_vst(in5, coeffs, 320);
        __lsx_vst(in6, coeffs, 384);
        __lsx_vst(in7, coeffs, 448);
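
/* 32x32 inverse DCT: handled as four groups of eight columns per pass,
 * with an aligned temporary buffer and the 32x8/8x32 transpose helpers
 * above. */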
 
    uint8_t row_cnt, col_cnt;
    int16_t *src = coeffs;
    int16_t tmp_buf[8 * 32 + 31];
    int16_t *tmp_buf_ptr = tmp_buf + 31;

    /* Align the temporary buffer to a 64-byte boundary. */
    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    for (col_cnt = 0; col_cnt < 4; col_cnt++) {

    for (row_cnt = 0; row_cnt < 4; row_cnt++) {

        src = (coeffs + 32 * 8 * row_cnt);