25 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \ 
   27     out0 = __msa_subs_u_h(out0, in0);            \ 
   28     out1 = __msa_subs_u_h(out1, in1);            \ 
   39     for (row = 16; row--;) {
 
   54     for (row = 32; row--;) {
 
   67     for (row = 4; row--;) {
 
   71         src0 = (v16u8) __msa_fill_b(inp >> 24);
 
   72         src1 = (v16u8) __msa_fill_b(inp >> 16);
 
   73         src2 = (v16u8) __msa_fill_b(inp >> 8);
 
   74         src3 = (v16u8) __msa_fill_b(inp);
 
   77         dst += (4 * dst_stride);
 
   88     for (row = 8; row--;) {
 
   92         src0 = (v16u8) __msa_fill_b(inp >> 24);
 
   93         src1 = (v16u8) __msa_fill_b(inp >> 16);
 
   94         src2 = (v16u8) __msa_fill_b(inp >> 8);
 
   95         src3 = (v16u8) __msa_fill_b(inp);
 
  101         ST_UB2(src2, src2, dst, 16);
 
  103         ST_UB2(src3, src3, dst, 16);
 
  112     v16i8 store, 
src = { 0 };
 
  120     sum_h = __msa_hadd_u_h((v16u8) 
src, (v16u8) 
src);
 
  121     sum_w = __msa_hadd_u_w(sum_h, sum_h);
 
  122     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  123     sum_w = (v4u32) __msa_srari_w((v4i32) 
sum_d, 3);
 
  124     store = __msa_splati_b((v16i8) sum_w, 0);
 
  125     val0 = __msa_copy_u_w((v4i32) store, 0);
 
  127     SW4(val0, val0, val0, val0, dst, dst_stride);
 
  130 #define INTRA_DC_TL_4x4(dir)                                    \ 
  131 void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \ 
  132                            const uint8_t *left,                 \ 
  133                            const uint8_t *top)                  \ 
  136     v16i8 store, data = { 0 };                                  \ 
  141     data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \ 
  142     sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \ 
  143     sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \ 
  144     sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \ 
  145     store = __msa_splati_b((v16i8) sum_w, 0);                   \ 
  146     val0 = __msa_copy_u_w((v4i32) store, 0);                    \ 
  148     SW4(val0, val0, val0, val0, dst, dst_stride);               \ 
  166     sum_h = __msa_hadd_u_h(
src, 
src);
 
  167     sum_w = __msa_hadd_u_w(sum_h, sum_h);
 
  168     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  169     sum_w = (v4u32) __msa_pckev_w((v4i32) 
sum_d, (v4i32) 
sum_d);
 
  170     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  171     sum_w = (v4u32) __msa_srari_w((v4i32) 
sum_d, 4);
 
  172     store = __msa_splati_b((v16i8) sum_w, 0);
 
  173     val0 = __msa_copy_u_d((v2i64) store, 0);
 
  175     SD4(val0, val0, val0, val0, dst, dst_stride);
 
  176     dst += (4 * dst_stride);
 
  177     SD4(val0, val0, val0, val0, dst, dst_stride);
 
  180 #define INTRA_DC_TL_8x8(dir)                                    \ 
  181 void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \ 
  182                            const uint8_t *left,                 \ 
  183                            const uint8_t *top)                  \ 
  187     v16u8 data = { 0 };                                         \ 
  193     data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \ 
  194     sum_h = __msa_hadd_u_h(data, data);                         \ 
  195     sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \ 
  196     sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \ 
  197     sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \ 
  198     store = __msa_splati_b((v16i8) sum_w, 0);                   \ 
  199     val0 = __msa_copy_u_d((v2i64) store, 0);                    \ 
  201     SD4(val0, val0, val0, val0, dst, dst_stride);               \ 
  202     dst += (4 * dst_stride);                                    \ 
  203     SD4(val0, val0, val0, val0, dst, dst_stride);               \ 
  213     v8u16 sum_h, sum_top, sum_left;
 
  217     top = 
LD_UB(src_top);
 
  220     sum_h = sum_top + sum_left;
 
  221     sum_w = __msa_hadd_u_w(sum_h, sum_h);
 
  222     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  223     sum_w = (v4u32) __msa_pckev_w((v4i32) 
sum_d, (v4i32) 
sum_d);
 
  224     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  225     sum_w = (v4u32) __msa_srari_w((v4i32) 
sum_d, 5);
 
  226     out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
 
  229     dst += (8 * dst_stride);
 
  233 #define INTRA_DC_TL_16x16(dir)                                        \ 
  234 void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \ 
  235                              const uint8_t *left,                     \ 
  236                              const uint8_t *top)                      \ 
  244     sum_h = __msa_hadd_u_h(data, data);                               \ 
  245     sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \ 
  246     sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \ 
  247     sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \ 
  248     sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \ 
  249     sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \ 
  250     out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \ 
  252     ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \ 
  253     dst += (8 * dst_stride);                                          \ 
  254     ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \ 
  263     v16u8 top0, top1, left0, left1, 
out;
 
  264     v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
 
  268     LD_UB2(src_top, 16, top0, top1);
 
  269     LD_UB2(src_left, 16, left0, left1);
 
  272     sum_h = sum_top0 + sum_top1;
 
  273     sum_h += sum_left0 + sum_left1;
 
  274     sum_w = __msa_hadd_u_w(sum_h, sum_h);
 
  275     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  276     sum_w = (v4u32) __msa_pckev_w((v4i32) 
sum_d, (v4i32) 
sum_d);
 
  277     sum_d = __msa_hadd_u_d(sum_w, sum_w);
 
  278     sum_w = (v4u32) __msa_srari_w((v4i32) 
sum_d, 6);
 
  279     out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
 
  281     for (row = 16; row--;)
 
  290 #define INTRA_DC_TL_32x32(dir)                                    \ 
  291 void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \ 
  292                              const uint8_t *left,                 \ 
  293                              const uint8_t *top)                  \ 
  296     v16u8 data0, data1, out;                                      \ 
  297     v8u16 sum_h, sum_data0, sum_data1;                            \ 
  301     LD_UB2(dir, 16, data0, data1);                                \ 
  302     HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \ 
  303     sum_h = sum_data0 + sum_data1;                                \ 
  304     sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \ 
  305     sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \ 
  306     sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \ 
  307     sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \ 
  308     sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \ 
  309     out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \ 
  311     for (row = 16; row--;)                                        \ 
  313         ST_UB2(out, out, dst, 16);                                \ 
  315         ST_UB2(out, out, dst, 16);                                \ 
  322 #define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \ 
  323 void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \ 
  324                              const uint8_t *left, const uint8_t *top)  \ 
  326     v16u8 out = (v16u8) __msa_ldi_b(val);                              \ 
  328     ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \ 
  329     dst += (8 * dst_stride);                                           \ 
  330     ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \ 
  337 #define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \ 
  338 void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \ 
  339                              const uint8_t *left, const uint8_t *top)  \ 
  342     v16u8 out = (v16u8) __msa_ldi_b(val);                              \ 
  344     for (row = 16; row--;)                                             \ 
  346         ST_UB2(out, out, dst, 16);                                     \ 
  348         ST_UB2(out, out, dst, 16);                                     \ 
  361     uint8_t top_left = src_top_ptr[-1];
 
  362     v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
 
  364     v8u16 src_top_left, vec0, vec1, vec2, vec3;
 
  366     src_top_left = (v8u16) __msa_fill_h(top_left);
 
  367     src_top = 
LD_SB(src_top_ptr);
 
  369     src_left0 = __msa_fill_b(
left >> 24);
 
  370     src_left1 = __msa_fill_b(
left >> 16);
 
  371     src_left2 = __msa_fill_b(
left >> 8);
 
  372     src_left3 = __msa_fill_b(
left);
 
  374     ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
 
  375                src_left3, src_top, 
src0, 
src1, src2, src3);
 
  381     ST_W2(tmp0, 0, 2, dst, dst_stride);
 
  382     ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
 
  388     uint8_t top_left = src_top_ptr[-1];
 
  389     uint32_t loop_cnt, 
left;
 
  390     v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
 
  391     v8u16 src_top_left, vec0, vec1, vec2, vec3;
 
  394     src_top = 
LD_SB(src_top_ptr);
 
  395     src_top_left = (v8u16) __msa_fill_h(top_left);
 
  398     for (loop_cnt = 2; loop_cnt--;) {
 
  400         src_left0 = __msa_fill_b(
left >> 24);
 
  401         src_left1 = __msa_fill_b(
left >> 16);
 
  402         src_left2 = __msa_fill_b(
left >> 8);
 
  403         src_left3 = __msa_fill_b(
left);
 
  406         ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
 
  407                    src_left3, src_top, 
src0, 
src1, src2, src3);
 
  413         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
 
  414         dst += (4 * dst_stride);
 
  421     uint8_t top_left = src_top_ptr[-1];
 
  422     uint32_t loop_cnt, 
left;
 
  423     v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
 
  424     v8u16 src_top_left, res_r, res_l;
 
  426     src_top = 
LD_SB(src_top_ptr);
 
  427     src_top_left = (v8u16) __msa_fill_h(top_left);
 
  430     for (loop_cnt = 4; loop_cnt--;) {
 
  432         src_left0 = __msa_fill_b(
left >> 24);
 
  433         src_left1 = __msa_fill_b(
left >> 16);
 
  434         src_left2 = __msa_fill_b(
left >> 8);
 
  435         src_left3 = __msa_fill_b(
left);
 
  472     uint8_t top_left = src_top_ptr[-1];
 
  473     uint32_t loop_cnt, 
left;
 
  474     v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
 
  475     v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
 
  477     src_top0 = 
LD_SB(src_top_ptr);
 
  478     src_top1 = 
LD_SB(src_top_ptr + 16);
 
  479     src_top_left = (v8u16) __msa_fill_h(top_left);
 
  482     for (loop_cnt = 8; loop_cnt--;) {
 
  484         src_left0 = __msa_fill_b(
left >> 24);
 
  485         src_left1 = __msa_fill_b(
left >> 16);
 
  486         src_left2 = __msa_fill_b(
left >> 8);
 
  487         src_left3 = __msa_fill_b(
left);
 
  490         ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
 
  491         ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
 
  492         HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
 
  496         SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
 
  501         ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
 
  502         ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
 
  503         HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
 
  507         SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
 
  512         ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
 
  513         ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
 
  514         HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
 
  518         SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
 
  523         ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
 
  524         ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
 
  525         HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
 
  529         SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);