27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/*
 * Bi-weighted combine, round and clip for two vector pairs
 * (in0 with vec0, in1 with vec1); results are written to out0 and out1.
 *
 * Sequence of operations:
 *   1. ILVR_H2_SW / ILVL_H2_SW build the "right" (out0_r, out1_r) and
 *      "left" (out0_l, out1_l) temporaries from each inN/vecN pair
 *      (presumably a halfword interleave of the two sources, going by the
 *      helper names - confirm against the helper definitions).
 *   2. Each temporary is reinterpreted as v8i16 and passed with wgt to
 *      __msa_dpadd_s_w(), using offset as the accumulator input.
 *   3. SRAR_W4_SW is applied to all four accumulators with rnd.
 *   4. PCKEV_H2_SH combines each left/right pair into out0 and out1.
 *   5. CLIP_SH2_0_255 is applied to out0 and out1.
 *
 * NOTE(review): wgt is cast to v8i16 for the dot product, so it presumably
 * carries two packed 16-bit weights per 32-bit lane (one per interleaved
 * source) - confirm against the callers that build it.
 * The macro declares its own v4i32 temporaries named out0_r, out1_r,
 * out0_l and out1_l, so callers must not pass arguments with those names.
 */
31 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
34 v4i32 out0_r, out1_r, out0_l, out1_l; \
36 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
37 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
39 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
40 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
41 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
42 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
44 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
45 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
46 CLIP_SH2_0_255(out0, out1); \
/*
 * Four-pair variant of HEVC_BIW_RND_CLIP2.
 *
 * Invokes HEVC_BIW_RND_CLIP2 twice with the same wgt, rnd and offset:
 *   - (in0, in1) with (vec0, vec1), producing out0 and out1;
 *   - (in2, in3) with (vec2, vec3), producing out2 and out3.
 */
49 #define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \
50 wgt, rnd, offset, out0, out1, out2, out3) \
52 HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
53 HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
/*
 * Bi-weighted combine, round and clip for two vector pairs
 * (in0 with vec0, in1 with vec1); results are written to out0 and out1.
 *
 * Sequence of operations:
 *   1. ILVR_H2_SW / ILVL_H2_SW build the right/left temporaries from each
 *      inN/vecN pair.
 *   2. __msa_dpadd_s_w() is applied to each temporary (viewed as v8i16)
 *      with wgt, using offset as the accumulator input.
 *   3. SRAR_W4_SW is applied to the four accumulators with rnd.
 *   4. PCKEV_H2_SH combines each left/right pair into out0 and out1.
 *   5. CLIP_SH2_0_255 is applied to out0 and out1.
 *
 * NOTE(review): these statements are the same as the ones in
 * HEVC_BIW_RND_CLIP2, including the final CLIP_SH2_0_255 - confirm whether
 * the two macros are still meant to differ.
 * The macro declares its own v4i32 temporaries named out0_r, out1_r,
 * out0_l and out1_l, so callers must not pass arguments with those names.
 */
56 #define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, \
59 v4i32 out0_r, out1_r, out0_l, out1_l; \
61 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
62 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
63 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
64 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
65 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
66 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
67 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
68 PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \
69 CLIP_SH2_0_255(out0, out1); \
/*
 * Four-pair variant of HEVC_BIW_RND_CLIP2_MAX_SATU.
 *
 * Invokes HEVC_BIW_RND_CLIP2_MAX_SATU twice with the same wgt, rnd and
 * offset: once for (in0, in1) with (vec0, vec1) and once for (in2, in3)
 * with (vec2, vec3).
 */
72 #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
73 vec3, wgt, rnd, offset, out0, out1, \
76 HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, \
78 HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset, \
95 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96 uint64_t tpd0, tpd1, tpd2, tpd3;
101 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102 v8i16 dst0, dst1, dst2, dst3, weight_vec;
103 v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
105 offset = (offset0 + offset1) << rnd_val;
106 weight0 = weight0 & 0x0000FFFF;
107 weight = weight0 | (weight1 << 16);
109 offset_vec = __msa_fill_w(
offset);
110 weight_vec = (v8i16) __msa_fill_w(
weight);
111 rnd_vec = __msa_fill_w(rnd_val + 1);
114 LW2(src0_ptr, src_stride, tp0, tp1);
116 LD2(src1_ptr, src2_stride, tpd0, tpd1);
119 dst0 = (v8i16) __msa_ilvr_b(
zero,
src0);
123 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
126 dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
128 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129 ST_W2(out0, 0, 1, dst, dst_stride);
131 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
133 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
139 offset_vec, dst0, dst1);
140 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
142 }
else if (0 ==
height % 8) {
143 for (loop_cnt = (
height >> 3); loop_cnt--;) {
144 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145 src0_ptr += 4 * src_stride;
147 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148 src0_ptr += 4 * src_stride;
150 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151 src1_ptr += (4 * src2_stride);
154 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155 src1_ptr += (4 * src2_stride);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
162 in3, weight_vec, rnd_vec, offset_vec,
163 dst0, dst1, dst2, dst3);
165 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
166 dst += (8 * dst_stride);
186 uint64_t tp0, tp1, tp2, tp3;
190 v8i16 in0, in1, in2, in3;
191 v8i16 dst0, dst1, dst2, dst3;
192 v4i32 offset_vec, weight_vec, rnd_vec;
194 offset = (offset0 + offset1) << rnd_val;
195 weight0 = weight0 & 0x0000FFFF;
196 weight = weight0 | (weight1 << 16);
198 weight_vec = __msa_fill_w(
weight);
199 offset_vec = __msa_fill_w(
offset);
200 rnd_vec = __msa_fill_w(rnd_val + 1);
202 for (loop_cnt = (
height >> 2); loop_cnt--;) {
203 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204 src0_ptr += (4 * src_stride);
207 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208 src1_ptr += (4 * src2_stride);
211 SLLI_4V(dst0, dst1, dst2, dst3, 6);
214 weight_vec, rnd_vec, offset_vec,
215 dst0, dst1, dst2, dst3);
217 ST_W2(out0, 0, 2, dst, dst_stride);
218 ST_H2(out0, 2, 6, dst + 4, dst_stride);
219 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
220 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
221 dst += (4 * dst_stride);
238 uint64_t tp0, tp1, tp2, tp3;
240 v16u8 out0, out1, out2;
242 v16i8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 };
243 v8i16 in0, in1, in2, in3, in4, in5;
244 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
245 v4i32 offset_vec, weight_vec, rnd_vec;
247 offset = (offset0 + offset1) << rnd_val;
248 weight0 = weight0 & 0x0000FFFF;
249 weight = weight0 | (weight1 << 16);
251 offset_vec = __msa_fill_w(
offset);
252 weight_vec = __msa_fill_w(
weight);
253 rnd_vec = __msa_fill_w(rnd_val + 1);
256 LD2(src0_ptr, src_stride, tp0, tp1);
258 LD_SH2(src1_ptr, src2_stride, in0, in1);
263 weight_vec, rnd_vec, offset_vec,
266 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
267 ST_D2(out0, 0, 1, dst, dst_stride);
269 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
270 src0_ptr += 4 * src_stride;
273 LD2(src0_ptr, src_stride, tp0, tp1);
278 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
279 SLLI_4V(dst0, dst1, dst2, dst3, 6);
282 weight_vec, rnd_vec, offset_vec, dst0, dst1,
285 offset_vec, dst4, dst5);
286 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
287 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
288 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
289 }
else if (0 ==
height % 4) {
292 for (loop_cnt = (
height >> 2); loop_cnt--;) {
293 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
294 src0_ptr += (4 * src_stride);
299 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
300 src1_ptr += (4 * src2_stride);
302 SLLI_4V(dst0, dst1, dst2, dst3, 6);
304 in3, weight_vec, rnd_vec, offset_vec,
305 dst0, dst1, dst2, dst3);
307 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
308 dst += (4 * dst_stride);
329 v16u8 out0, out1, out2;
331 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
332 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
333 v4i32 offset_vec, weight_vec, rnd_vec;
335 offset = (offset0 + offset1) << rnd_val;
336 weight0 = weight0 & 0x0000FFFF;
337 weight = weight0 | (weight1 << 16);
339 offset_vec = __msa_fill_w(
offset);
340 weight_vec = __msa_fill_w(
weight);
341 rnd_vec = __msa_fill_w(rnd_val + 1);
343 for (loop_cnt = (16 >> 2); loop_cnt--;) {
345 src0_ptr += (4 * src_stride);
346 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
347 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
348 src1_ptr += (4 * src2_stride);
352 dst0, dst1, dst2, dst3);
354 SLLI_4V(dst0, dst1, dst2, dst3, 6);
361 weight_vec, rnd_vec, offset_vec, dst0, dst1,
364 offset_vec, dst4, dst5);
365 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
366 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
367 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
368 dst += (4 * dst_stride);
387 v16u8 out0, out1, out2, out3;
390 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
391 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
392 v4i32 offset_vec, weight_vec, rnd_vec;
394 offset = (offset0 + offset1) << rnd_val;
395 weight0 = weight0 & 0x0000FFFF;
396 weight = weight0 | (weight1 << 16);
398 offset_vec = __msa_fill_w(
offset);
399 weight_vec = __msa_fill_w(
weight);
400 rnd_vec = __msa_fill_w(rnd_val + 1);
402 for (loop_cnt = (
height >> 2); loop_cnt--;) {
404 src0_ptr += (4 * src_stride);
405 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
406 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
407 src1_ptr += (4 * src2_stride);
408 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero, src2,
zero, src3, tmp0, tmp1,
410 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero, src2,
zero, src3, tmp4, tmp5,
412 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
413 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
415 weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
418 weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
422 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
423 dst += (4 * dst_stride);
442 v16u8 out0, out1, out2, out3, out4, out5;
443 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
zero = { 0 };
444 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
445 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
446 v4i32 offset_vec, weight_vec, rnd_vec;
448 offset = (offset0 + offset1) << rnd_val;
449 weight0 = weight0 & 0x0000FFFF;
450 weight = weight0 | (weight1 << 16);
452 offset_vec = __msa_fill_w(
offset);
453 weight_vec = __msa_fill_w(
weight);
454 rnd_vec = __msa_fill_w(rnd_val + 1);
456 for (loop_cnt = 8; loop_cnt--;) {
458 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
459 src0_ptr += (4 * src_stride);
460 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
461 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
462 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
463 src1_ptr += (4 * src2_stride);
471 SLLI_4V(dst0, dst1, dst2, dst3, 6);
472 SLLI_4V(dst4, dst5, dst6, dst7, 6);
473 SLLI_4V(dst8, dst9, dst10, dst11, 6);
475 weight_vec, rnd_vec, offset_vec, dst0, dst1,
478 weight_vec, rnd_vec, offset_vec, dst4, dst5,
481 in11, weight_vec, rnd_vec, offset_vec,
482 dst8, dst9, dst10, dst11);
483 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
484 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
485 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
486 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
487 dst += (4 * dst_stride);
506 v16u8 out0, out1, out2, out3;
509 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
510 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
511 v4i32 offset_vec, weight_vec, rnd_vec;
513 offset = (offset0 + offset1) << rnd_val;
514 weight0 = weight0 & 0x0000FFFF;
515 weight = weight0 | (weight1 << 16);
517 offset_vec = __msa_fill_w(
offset);
518 weight_vec = __msa_fill_w(
weight);
519 rnd_vec = __msa_fill_w(rnd_val + 1);
521 for (loop_cnt = (
height >> 1); loop_cnt--;) {
523 src0_ptr += src_stride;
524 LD_SB2(src0_ptr, 16, src2, src3);
525 src0_ptr += src_stride;
526 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
527 src1_ptr += src2_stride;
528 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
529 src1_ptr += src2_stride;
535 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
536 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
538 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
541 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
545 ST_UB2(out0, out1, dst, 16);
547 ST_UB2(out2, out3, dst, 16);
567 v16u8 out0, out1, out2;
570 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
571 v4i32 offset_vec, weight_vec, rnd_vec;
573 offset = (offset0 + offset1) << rnd_val;
574 weight0 = weight0 & 0x0000FFFF;
575 weight = weight0 | (weight1 << 16);
577 offset_vec = __msa_fill_w(
offset);
578 weight_vec = __msa_fill_w(
weight);
579 rnd_vec = __msa_fill_w(rnd_val + 1);
581 for (loop_cnt = 64; loop_cnt--;) {
583 src0_ptr += src_stride;
584 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
585 src1_ptr += src2_stride;
590 SLLI_4V(dst0, dst1, dst2, dst3, 6);
593 weight_vec, rnd_vec, offset_vec, dst0, dst1,
596 offset_vec, dst4, dst5);
597 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
598 ST_UB2(out0, out1, dst, 16);
599 ST_UB(out2, dst + 32);
619 v16u8 out0, out1, out2, out3;
622 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
623 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
624 v4i32 offset_vec, weight_vec, rnd_vec;
626 offset = (offset0 + offset1) << rnd_val;
627 weight0 = weight0 & 0x0000FFFF;
628 weight = weight0 | (weight1 << 16);
630 offset_vec = __msa_fill_w(
offset);
631 weight_vec = __msa_fill_w(
weight);
632 rnd_vec = __msa_fill_w(rnd_val + 1);
634 for (loop_cnt =
height; loop_cnt--;) {
636 src0_ptr += src_stride;
637 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
638 src1_ptr += src2_stride;
640 ILVR_B4_SH(
zero,
src0,
zero,
src1,
zero, src2,
zero, src3, tmp0, tmp1,
642 ILVL_B4_SH(
zero,
src0,
zero,
src1,
zero, src2,
zero, src3, tmp4, tmp5,
644 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
645 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
647 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
650 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
654 ST_UB4(out0, out1, out2, out3, dst, 16);
675 v8i16 filt0, filt1, filt2, filt3;
677 v16i8 mask1, mask2, mask3;
678 v16i8 vec0, vec1, vec2, vec3;
680 v8i16 in0, in1, in2, in3;
681 v8i16 filter_vec, out0, out1;
682 v4i32 weight_vec, offset_vec, rnd_vec;
687 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
693 offset = (offset0 + offset1) << rnd_val;
694 weight0 = weight0 & 0x0000FFFF;
695 weight = weight0 | (weight1 << 16);
696 constant = 128 * weight1;
700 offset_vec = __msa_fill_w(
offset);
701 weight_vec = __msa_fill_w(
weight);
702 rnd_vec = __msa_fill_w(rnd_val + 1);
704 for (loop_cnt = (
height >> 2); loop_cnt--;) {
706 src0_ptr += (4 * src_stride);
707 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
708 src1_ptr += (4 * src2_stride);
713 vec0, vec1, vec2, vec3);
716 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
717 vec0, vec1, vec2, vec3);
722 weight_vec, rnd_vec, offset_vec,
725 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
726 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
727 dst += (4 * dst_stride);
747 v8i16 filt0, filt1, filt2, filt3;
749 v16i8 mask1, mask2, mask3;
750 v16i8 vec0, vec1, vec2, vec3;
751 v8i16 dst0, dst1, dst2, dst3;
752 v8i16 in0, in1, in2, in3;
753 v8i16 filter_vec, out0, out1, out2, out3;
754 v4i32 weight_vec, offset_vec, rnd_vec;
758 offset = (offset0 + offset1) << rnd_val;
759 weight0 = weight0 & 0x0000FFFF;
760 weight = weight0 | (weight1 << 16);
761 constant = 128 * weight1;
765 offset_vec = __msa_fill_w(
offset);
766 weight_vec = __msa_fill_w(
weight);
767 rnd_vec = __msa_fill_w(rnd_val + 1);
770 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
776 for (loop_cnt = (
height >> 2); loop_cnt--;) {
778 src0_ptr += (4 * src_stride);
779 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
780 src1_ptr += (4 * src2_stride);
784 vec0, vec1, vec2, vec3);
788 vec0, vec1, vec2, vec3);
791 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
792 vec0, vec1, vec2, vec3);
795 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
796 vec0, vec1, vec2, vec3);
802 weight_vec, rnd_vec, offset_vec,
803 out0, out1, out2, out3);
806 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
807 dst += (4 * dst_stride);
827 v16i8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3;
828 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829 v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
830 v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
831 v4i32 weight_vec, offset_vec, rnd_vec;
835 weight0 = weight0 & 0x0000FFFF;
836 weight = weight0 | (weight1 << 16);
837 constant = 128 * weight1;
839 offset = (offset0 + offset1) << rnd_val;
842 offset_vec = __msa_fill_w(
offset);
843 weight_vec = __msa_fill_w(
weight);
844 rnd_vec = __msa_fill_w(rnd_val + 1);
847 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
858 for (loop_cnt = 4; loop_cnt--;) {
860 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
870 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
874 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
879 weight_vec, rnd_vec, offset_vec, out0, out1, out2,
882 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
885 src0_ptr += (4 * src_stride);
886 LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
887 src1_ptr += (4 * src2_stride);
894 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
899 offset_vec, out0, out1);
900 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
901 ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
902 dst += (4 * dst_stride);
923 v8i16 in0, in1, in2, in3;
924 v8i16 filt0, filt1, filt2, filt3;
925 v16i8 mask1, mask2, mask3;
926 v8i16 filter_vec, out0, out1, out2, out3;
927 v16i8 vec0, vec1, vec2, vec3;
928 v8i16 dst0, dst1, dst2, dst3;
929 v4i32 weight_vec, offset_vec, rnd_vec;
930 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
933 offset = (offset0 + offset1) << rnd_val;
934 weight0 = weight0 & 0x0000FFFF;
935 weight = weight0 | (weight1 << 16);
936 constant = 128 * weight1;
940 offset_vec = __msa_fill_w(
offset);
941 weight_vec = __msa_fill_w(
weight);
942 rnd_vec = __msa_fill_w(rnd_val + 1);
945 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
951 for (loop_cnt = (
height >> 1); loop_cnt--;) {
953 src0_ptr += src_stride;
954 LD_SB2(src0_ptr, 8, src2, src3);
955 src0_ptr += src_stride;
956 LD_SH2(src1_ptr, 8, in0, in1);
957 src1_ptr += src2_stride;
958 LD_SH2(src1_ptr, 8, in2, in3);
959 src1_ptr += src2_stride;
963 vec0, vec1, vec2, vec3);
967 vec0, vec1, vec2, vec3);
970 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
971 vec0, vec1, vec2, vec3);
974 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
975 vec0, vec1, vec2, vec3);
981 weight_vec, rnd_vec, offset_vec,
982 out0, out1, out2, out3);
985 ST_SH2(out0, out1, dst, dst_stride);
986 dst += (2 * dst_stride);
1008 v8i16 in0, in1, in2;
1009 v8i16 filt0, filt1, filt2, filt3;
1010 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1011 v16i8 vec0, vec1, vec2, vec3;
1012 v8i16 dst0, dst1, dst2;
1013 v4i32 dst2_r, dst2_l;
1014 v8i16 filter_vec, out0, out1, out2;
1015 v4i32 weight_vec, offset_vec, rnd_vec;
1018 src0_ptr = src0_ptr - 3;
1019 offset = (offset0 + offset1) << rnd_val;
1020 weight0 = weight0 & 0x0000FFFF;
1021 weight = weight0 | (weight1 << 16);
1022 constant = 128 * weight1;
1026 offset_vec = __msa_fill_w(
offset);
1027 weight_vec = __msa_fill_w(
weight);
1028 rnd_vec = __msa_fill_w(rnd_val + 1);
1031 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1042 src0_ptr += src_stride;
1043 LD_SH2(src1_ptr, 8, in0, in1);
1044 in2 =
LD_SH(src1_ptr + 16);
1045 src1_ptr += src2_stride;
1048 for (loop_cnt = 31; loop_cnt--;) {
1050 vec0, vec1, vec2, vec3);
1054 vec0, vec1, vec2, vec3);
1058 vec0, vec1, vec2, vec3);
1063 weight_vec, rnd_vec, offset_vec,
1067 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1068 (v8i16) weight_vec);
1069 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1070 (v8i16) weight_vec);
1072 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1076 src0_ptr += src_stride;
1077 LD_SH2(src1_ptr, 8, in0, in1);
1078 in2 =
LD_SH(src1_ptr + 16);
1079 src1_ptr += src2_stride;
1082 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1084 SD(dst_val0, dst + 16);
1100 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1101 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1103 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1106 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1108 SD(dst_val0, dst + 16);
1129 v8i16 in0, in1, in2, in3;
1130 v8i16 filt0, filt1, filt2, filt3;
1132 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1133 v16i8 vec0, vec1, vec2, vec3;
1134 v8i16 dst0, dst1, dst2, dst3;
1135 v8i16 filter_vec, out0, out1, out2, out3;
1136 v4i32 weight_vec, offset_vec, rnd_vec;
1139 offset = (offset0 + offset1) << rnd_val;
1140 weight0 = weight0 & 0x0000FFFF;
1141 weight = weight0 | (weight1 << 16);
1142 constant = 128 * weight1;
1146 offset_vec = __msa_fill_w(
offset);
1147 weight_vec = __msa_fill_w(
weight);
1148 rnd_vec = __msa_fill_w(rnd_val + 1);
1151 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1161 for (loop_cnt =
height; loop_cnt--;) {
1163 src2 =
LD_SB(src0_ptr + 24);
1164 src0_ptr += src_stride;
1165 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1166 src1_ptr += src2_stride;
1171 vec0, vec1, vec2, vec3);
1175 vec0, vec1, vec2, vec3);
1179 vec0, vec1, vec2, vec3);
1182 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1183 vec0, vec1, vec2, vec3);
1189 weight_vec, rnd_vec, offset_vec,
1190 out0, out1, out2, out3);
1193 ST_SH2(out0, out1, dst, 16);
1214 v16i8
src0,
src1, src2, src3, src4;
1215 v8i16 in0, in1, in2, in3;
1216 v8i16 filt0, filt1, filt2, filt3;
1218 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1219 v16i8 vec0, vec1, vec2, vec3;
1220 v8i16 dst0, dst1, dst2, dst3;
1221 v8i16 filter_vec, out0, out1, out2, out3;
1222 v4i32 weight_vec, offset_vec, rnd_vec;
1225 offset = (offset0 + offset1) << rnd_val;
1226 weight0 = weight0 & 0x0000FFFF;
1227 weight = weight0 | (weight1 << 16);
1228 constant = 128 * weight1;
1232 offset_vec = __msa_fill_w(
offset);
1233 weight_vec = __msa_fill_w(
weight);
1234 rnd_vec = __msa_fill_w(rnd_val + 1);
1237 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1247 for (loop_cnt = 64; loop_cnt--;) {
1249 src2 =
LD_SB(src0_ptr + 24);
1250 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1252 LD_SB2(src0_ptr + 32, 8, src3, src4);
1253 src0_ptr += src_stride;
1257 vec0, vec1, vec2, vec3);
1261 vec0, vec1, vec2, vec3);
1265 vec0, vec1, vec2, vec3);
1268 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1269 vec0, vec1, vec2, vec3);
1274 weight_vec, rnd_vec, offset_vec,
1275 out0, out1, out2, out3);
1278 ST_SH2(out0, out1, dst, 16);
1280 LD_SH2(src1_ptr + 32, 8, in2, in3);
1281 src1_ptr += src2_stride;
1283 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1284 vec0, vec1, vec2, vec3);
1287 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1288 vec0, vec1, vec2, vec3);
1293 weight_vec, rnd_vec, offset_vec,
1296 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1297 ST_SH(out0, dst + 32);
1318 int16_t *src1_ptr_tmp;
1319 uint32_t loop_cnt, cnt;
1322 v8i16 in0, in1, in2, in3;
1323 v8i16 filt0, filt1, filt2, filt3;
1325 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1326 v16i8 vec0, vec1, vec2, vec3;
1327 v8i16 dst0, dst1, dst2, dst3;
1328 v8i16 filter_vec, out0, out1, out2, out3;
1329 v4i32 weight_vec, offset_vec, rnd_vec;
1332 offset = (offset0 + offset1) << rnd_val;
1333 weight0 = weight0 & 0x0000FFFF;
1334 weight = weight0 | (weight1 << 16);
1335 constant = 128 * weight1;
1339 offset_vec = __msa_fill_w(
offset);
1340 weight_vec = __msa_fill_w(
weight);
1341 rnd_vec = __msa_fill_w(rnd_val + 1);
1344 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1354 for (loop_cnt =
height; loop_cnt--;) {
1355 src0_ptr_tmp = src0_ptr;
1357 src1_ptr_tmp = src1_ptr;
1359 for (cnt = 2; cnt--;) {
1361 src2 =
LD_SB(src0_ptr_tmp + 24);
1363 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1368 vec0, vec1, vec2, vec3);
1372 vec0, vec1, vec2, vec3);
1376 vec0, vec1, vec2, vec3);
1379 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1380 vec0, vec1, vec2, vec3);
1386 weight_vec, rnd_vec, offset_vec,
1387 out0, out1, out2, out3);
1390 ST_SH2(out0, out1, dst_tmp, 16);
1394 src0_ptr += src_stride;
1395 src1_ptr += src2_stride;
1417 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1418 v16i8 src11, src12, src13, src14;
1419 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1420 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1421 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1422 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1423 v16i8 src2110, src4332, src6554, src8776, src10998;
1424 v16i8 src12111110, src14131312;
1425 v8i16 dst10, dst32, dst54, dst76;
1426 v8i16 filt0, filt1, filt2, filt3;
1427 v8i16 filter_vec, out0, out1, out2, out3;
1428 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1430 src0_ptr -= (3 * src_stride);
1431 offset = (offset0 + offset1) << rnd_val;
1432 weight0 = weight0 & 0x0000FFFF;
1433 weight = weight0 | (weight1 << 16);
1435 const_vec = __msa_ldi_w(128);
1437 offset_vec = __msa_fill_w(
offset);
1438 weight_vec = __msa_fill_w(
weight);
1439 rnd_vec = __msa_fill_w(rnd_val + 1);
1440 weight1_vec = __msa_fill_w(weight1);
1441 offset_vec += const_vec * weight1_vec;
1444 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1446 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1447 src0_ptr += (7 * src_stride);
1450 src10_r, src32_r, src54_r, src21_r);
1451 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1452 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1453 src2110, src4332, src6554);
1456 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1457 LD_SB8(src0_ptr, src_stride,
1458 src7, src8, src9, src10, src11, src12, src13, src14);
1459 src0_ptr += (8 * src_stride);
1460 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1461 src1_ptr += (8 * src2_stride);
1465 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1466 src76_r, src87_r, src98_r, src109_r);
1467 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1468 src1110_r, src1211_r, src1312_r, src1413_r);
1469 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1470 src1413_r, src1312_r,
1471 src8776, src10998, src12111110, src14131312);
1474 DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1475 filt0, dst10, dst32, dst54, dst76);
1476 DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1477 filt1, dst10, dst32, dst54, dst76);
1478 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1479 filt2, filt2, dst10, dst32, dst54, dst76);
1480 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1481 filt3, filt3, dst10, dst32, dst54, dst76);
1485 weight_vec, rnd_vec, offset_vec,
1486 out0, out1, out2, out3);
1489 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1490 dst += (8 * dst_stride);
1493 src4332 = src12111110;
1494 src6554 = src14131312;
1515 v16i8
src0,
src1, src2, src3, src4, src5;
1516 v16i8 src6, src7, src8, src9, src10;
1517 v8i16 in0, in1, in2, in3;
1518 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1519 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1520 v8i16 tmp0, tmp1, tmp2, tmp3;
1521 v8i16 filt0, filt1, filt2, filt3;
1522 v8i16 filter_vec, out0, out1, out2, out3;
1523 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1525 src0_ptr -= (3 * src_stride);
1526 offset = (offset0 + offset1) << rnd_val;
1527 weight0 = weight0 & 0x0000FFFF;
1528 weight = weight0 | (weight1 << 16);
1530 const_vec = __msa_ldi_w(128);
1532 offset_vec = __msa_fill_w(
offset);
1533 weight_vec = __msa_fill_w(
weight);
1534 rnd_vec = __msa_fill_w(rnd_val + 1);
1535 weight1_vec = __msa_fill_w(weight1);
1536 offset_vec += const_vec * weight1_vec;
1539 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1541 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1542 src0_ptr += (7 * src_stride);
1546 src10_r, src32_r, src54_r, src21_r);
1547 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1549 for (loop_cnt = (
height >> 2); loop_cnt--;) {
1550 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1551 src0_ptr += (4 * src_stride);
1552 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1553 src1_ptr += (4 * src2_stride);
1556 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1557 src76_r, src87_r, src98_r, src109_r);
1559 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1560 filt0, tmp0, tmp1, tmp2, tmp3);
1561 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1562 filt1, tmp0, tmp1, tmp2, tmp3);
1563 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1564 filt2, tmp0, tmp1, tmp2, tmp3);
1565 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1566 filt3, tmp0, tmp1, tmp2, tmp3);
1570 weight_vec, rnd_vec, offset_vec,
1571 out0, out1, out2, out3);
1574 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1575 dst += (4 * dst_stride);
1603 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1604 v8i16 in0, in1, in2, in3;
1605 v16i8 src10_r, src32_r, src54_r, src76_r;
1606 v16i8 src21_r, src43_r, src65_r, src87_r;
1607 v8i16 tmp0, tmp1, tmp2;
1608 v16i8 src10_l, src32_l, src54_l, src76_l;
1609 v16i8 src21_l, src43_l, src65_l, src87_l;
1610 v16i8 src2110, src4332, src6554, src8776;
1611 v8i16 filt0, filt1, filt2, filt3;
1612 v8i16 out0, out1, out2, filter_vec;
1613 v4i32 dst2_r, dst2_l;
1614 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1616 src0_ptr -= (3 * src_stride);
1617 offset = (offset0 + offset1) << rnd_val;
1618 weight0 = weight0 & 0x0000FFFF;
1619 weight = weight0 | (weight1 << 16);
1621 const_vec = __msa_ldi_w(128);
1623 offset_vec = __msa_fill_w(
offset);
1624 weight_vec = __msa_fill_w(
weight);
1625 rnd_vec = __msa_fill_w(rnd_val + 1);
1626 weight1_vec = __msa_fill_w(weight1);
1627 offset_vec += const_vec * weight1_vec;
1630 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1632 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1633 src0_ptr += (7 * src_stride);
1637 src10_r, src32_r, src54_r, src21_r);
1638 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1640 src10_l, src32_l, src54_l, src21_l);
1641 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643 src2110, src4332, src6554);
1645 for (loop_cnt = 8; loop_cnt--;) {
1646 LD_SB2(src0_ptr, src_stride, src7, src8);
1647 src0_ptr += (2 * src_stride);
1648 LD_SH2(src1_ptr, src2_stride, in0, in1);
1649 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650 src1_ptr += (2 * src2_stride);
1651 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1654 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1658 DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1660 DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661 tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662 DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663 tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664 DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665 tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1668 weight_vec, rnd_vec, offset_vec,
1672 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673 (v8i16) weight_vec);
1674 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675 (v8i16) weight_vec);
1677 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1680 ST_D2(out0, 0, 1, dst, dst_stride);
1681 ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682 dst += (2 * dst_stride);
1713 int16_t *src1_ptr_tmp;
1715 uint32_t loop_cnt, cnt;
1717 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1718 v8i16 in0, in1, in2, in3;
1719 v16i8 src10_r, src32_r, src54_r, src76_r;
1720 v16i8 src21_r, src43_r, src65_r, src87_r;
1721 v16i8 src10_l, src32_l, src54_l, src76_l;
1722 v16i8 src21_l, src43_l, src65_l, src87_l;
1723 v8i16 tmp0, tmp1, tmp2, tmp3;
1724 v8i16 filt0, filt1, filt2, filt3;
1726 v8i16 out0, out1, out2, out3;
1727 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1729 src0_ptr -= (3 * src_stride);
1731 offset = (offset0 + offset1) << rnd_val;
1732 weight0 = weight0 & 0x0000FFFF;
1733 weight = weight0 | (weight1 << 16);
1735 const_vec = __msa_ldi_w(128);
1737 offset_vec = __msa_fill_w(
offset);
1738 weight_vec = __msa_fill_w(
weight);
1739 rnd_vec = __msa_fill_w(rnd_val + 1);
1740 weight1_vec = __msa_fill_w(weight1);
1741 offset_vec += const_vec * weight1_vec;
1744 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1746 for (cnt = (
width >> 4); cnt--;) {
1747 src0_ptr_tmp = src0_ptr;
1748 src1_ptr_tmp = src1_ptr;
1751 LD_SB7(src0_ptr_tmp, src_stride,
1752 src0,
src1, src2, src3, src4, src5, src6);
1753 src0_ptr_tmp += (7 * src_stride);
1757 src10_r, src32_r, src54_r, src21_r);
1758 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1760 src10_l, src32_l, src54_l, src21_l);
1761 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1763 for (loop_cnt = (
height >> 1); loop_cnt--;) {
1764 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765 src0_ptr_tmp += (2 * src_stride);
1766 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768 src1_ptr_tmp += (2 * src2_stride);
1771 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1774 DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775 filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776 DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777 filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778 DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779 filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780 DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781 filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1785 weight_vec, rnd_vec, offset_vec,
1786 out0, out1, out2, out3);
1789 ST_SH2(out0, out1, dst_tmp, dst_stride);
1790 dst_tmp += (2 * dst_stride);
1828 src1_ptr, src2_stride,
1830 weight0, weight1, offset0, offset1,
1849 src1_ptr, src2_stride,
1851 weight0, weight1, offset0, offset1,
1854 src1_ptr + 16, src2_stride,
1856 weight0, weight1, offset0, offset1, rnd_val);
1874 src1_ptr, src2_stride,
1876 weight0, weight1, offset0, offset1,
1895 src1_ptr, src2_stride,
1897 weight0, weight1, offset0, offset1,
1916 src1_ptr, src2_stride,
1918 weight0, weight1, offset0, offset1,
1928 const int8_t *filter_x,
1929 const int8_t *filter_y,
1941 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942 v8i16 in0 = { 0 }, in1 = { 0 };
1943 v8i16 filt0, filt1, filt2, filt3;
1944 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945 v16i8 mask1, mask2, mask3;
1946 v8i16 filter_vec, weight_vec;
1947 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950 v8i16 tmp0, tmp1, tmp2, tmp3;
1951 v8i16 dst10, dst32, dst54, dst76;
1952 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1956 src0_ptr -= ((3 * src_stride) + 3);
1958 filter_vec =
LD_SH(filter_x);
1959 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1961 filter_vec =
LD_SH(filter_y);
1964 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1970 offset = (offset0 + offset1) << rnd_val;
1971 weight0 = weight0 & 0x0000FFFF;
1972 weight = weight0 | (weight1 << 16);
1974 const_vec = __msa_fill_w((128 * weight1));
1976 offset_vec = __msa_fill_w(
offset);
1977 rnd_vec = __msa_fill_w(rnd_val + 1);
1978 offset_vec += const_vec;
1979 weight_vec = (v8i16) __msa_fill_w(
weight);
1981 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
1982 src0_ptr += (7 * src_stride);
1986 VSHF_B4_SB(
src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987 VSHF_B4_SB(
src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1988 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1989 vec8, vec9, vec10, vec11);
1990 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991 vec12, vec13, vec14, vec15);
2006 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2008 for (loop_cnt =
height >> 2; loop_cnt--;) {
2009 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010 src0_ptr += (4 * src_stride);
2013 LD2(src1_ptr, src2_stride, tp0, tp1);
2015 src1_ptr += (2 * src2_stride);
2016 LD2(src1_ptr, src2_stride, tp0, tp1);
2018 src1_ptr += (2 * src2_stride);
2020 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021 vec0, vec1, vec2, vec3);
2022 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023 vec4, vec5, vec6, vec7);
2029 dst76 = __msa_ilvr_h(dst97, dst66);
2031 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032 dst98 = __msa_ilvr_h(dst66, dst108);
2034 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2036 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2038 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2040 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2042 SRA_4V(dst0, dst1, dst2, dst3, 6);
2046 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2053 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2055 dst += (4 * dst_stride);
2063 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2073 const int8_t *filter_x,
2074 const int8_t *filter_y,
2083 uint32_t loop_cnt, cnt;
2086 int16_t *src1_ptr_tmp;
2089 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2091 v8i16 filt0, filt1, filt2, filt3;
2092 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2094 v16i8 mask1, mask2, mask3;
2095 v8i16 filter_vec, weight_vec;
2096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100 v8i16 tmp0, tmp1, tmp2, tmp3;
2101 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105 v4i32 offset_vec, rnd_vec, const_vec;
2107 src0_ptr -= ((3 * src_stride) + 3);
2109 offset = (offset0 + offset1) << rnd_val;
2110 weight0 = weight0 & 0x0000FFFF;
2111 weight = weight0 | (weight1 << 16);
2113 const_vec = __msa_fill_w((128 * weight1));
2115 offset_vec = __msa_fill_w(
offset);
2116 rnd_vec = __msa_fill_w(rnd_val + 1);
2117 offset_vec += const_vec;
2118 weight_vec = (v8i16) __msa_fill_w(
weight);
2120 filter_vec =
LD_SH(filter_x);
2121 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2123 filter_vec =
LD_SH(filter_y);
2126 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2132 for (cnt = width8mult; cnt--;) {
2133 src0_ptr_tmp = src0_ptr;
2134 src1_ptr_tmp = src1_ptr;
2137 LD_SB7(src0_ptr_tmp, src_stride,
2138 src0,
src1, src2, src3, src4, src5, src6);
2139 src0_ptr_tmp += (7 * src_stride);
2145 vec0, vec1, vec2, vec3);
2147 vec4, vec5, vec6, vec7);
2148 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149 vec8, vec9, vec10, vec11);
2150 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151 vec12, vec13, vec14, vec15);
2163 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164 vec0, vec1, vec2, vec3);
2165 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166 vec4, vec5, vec6, vec7);
2167 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168 vec8, vec9, vec10, vec11);
2177 for (loop_cnt =
height >> 1; loop_cnt--;) {
2178 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2180 src0_ptr_tmp += 2 * src_stride;
2182 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183 src1_ptr_tmp += (2 * src2_stride);
2185 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186 dst32_r, dst54_r, dst21_r);
2187 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188 dst32_l, dst54_l, dst21_l);
2189 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2192 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193 vec0, vec1, vec2, vec3);
2199 filt_h0, filt_h1, filt_h2, filt_h3);
2201 filt_h0, filt_h1, filt_h2, filt_h3);
2207 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208 vec0, vec1, vec2, vec3);
2214 filt_h0, filt_h1, filt_h2, filt_h3);
2216 filt_h0, filt_h1, filt_h2, filt_h3);
2221 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2224 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2230 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2233 dst_tmp += (2 * dst_stride);
2256 const int8_t *filter_x,
2257 const int8_t *filter_y,
2266 src1_ptr, src2_stride,
2267 dst, dst_stride, filter_x, filter_y,
2268 height, weight0, weight1, offset0,
2269 offset1, rnd_val, 1);
2278 const int8_t *filter_x,
2279 const int8_t *filter_y,
2288 uint8_t *src0_ptr_tmp, *dst_tmp;
2289 int16_t *src1_ptr_tmp;
2293 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2295 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2296 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2297 v8i16 in0 = { 0 }, in1 = { 0 };
2298 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2299 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2300 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2301 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2302 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2303 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2304 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2305 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2307 src0_ptr -= ((3 * src_stride) + 3);
2309 offset = (offset0 + offset1) << rnd_val;
2310 weight0 = weight0 & 0x0000FFFF;
2311 weight = weight0 | (weight1 << 16);
2313 const_vec = __msa_fill_w((128 * weight1));
2315 offset_vec = __msa_fill_w(
offset);
2316 rnd_vec = __msa_fill_w(rnd_val + 1);
2317 offset_vec += const_vec;
2318 weight_vec = (v8i16) __msa_fill_w(
weight);
2320 filter_vec =
LD_SH(filter_x);
2321 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2323 filter_vec =
LD_SH(filter_y);
2326 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2333 src0_ptr_tmp = src0_ptr;
2334 src1_ptr_tmp = src1_ptr;
2337 LD_SB7(src0_ptr_tmp, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
2338 src0_ptr_tmp += (7 * src_stride);
2343 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2345 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2355 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2356 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2357 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2366 for (loop_cnt = 8; loop_cnt--;) {
2367 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2368 src0_ptr_tmp += (2 * src_stride);
2371 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2372 src1_ptr_tmp += (2 * src2_stride);
2374 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2375 dst10_r, dst32_r, dst54_r, dst21_r);
2376 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2377 dst10_l, dst32_l, dst54_l, dst21_l);
2378 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2379 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2381 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2387 dst0 =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2388 filt_h1, filt_h2, filt_h3);
2389 dst1 =
HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2390 filt_h1, filt_h2, filt_h3);
2394 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2400 dst2 =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2401 filt_h1, filt_h2, filt_h3);
2402 dst3 =
HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2403 filt_h1, filt_h2, filt_h3);
2410 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2411 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2412 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2413 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2417 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2418 ST_D2(
out, 0, 1, dst_tmp, dst_stride);
2419 dst_tmp += (2 * dst_stride);
2439 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
2440 src0_ptr += (7 * src_stride);
2443 VSHF_B4_SB(
src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2444 VSHF_B4_SB(
src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2445 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2447 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2461 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2463 for (loop_cnt = 4; loop_cnt--;) {
2464 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2465 src0_ptr += (4 * src_stride);
2468 LD2(src1_ptr, src2_stride, tp0, tp1);
2470 src1_ptr += (2 * src2_stride);
2471 LD2(src1_ptr, src2_stride, tp0, tp1);
2473 src1_ptr += (2 * src2_stride);
2475 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2477 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2484 dst76 = __msa_ilvr_h(dst97, dst66);
2486 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2487 dst98 = __msa_ilvr_h(dst66, dst108);
2489 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2491 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2493 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2495 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2497 SRA_4V(dst0, dst1, dst2, dst3, 6);
2501 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2502 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2503 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2504 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2508 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2509 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
2510 dst += (4 * dst_stride);
2518 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2528 const int8_t *filter_x,
2529 const int8_t *filter_y,
2538 src1_ptr, src2_stride,
2539 dst, dst_stride, filter_x, filter_y,
2540 height, weight0, weight1, offset0,
2541 offset1, rnd_val, 2);
2550 const int8_t *filter_x,
2551 const int8_t *filter_y,
2560 src1_ptr, src2_stride,
2561 dst, dst_stride, filter_x, filter_y,
2562 height, weight0, weight1, offset0,
2563 offset1, rnd_val, 3);
2572 const int8_t *filter_x,
2573 const int8_t *filter_y,
2582 src1_ptr, src2_stride,
2583 dst, dst_stride, filter_x, filter_y,
2584 height, weight0, weight1, offset0,
2585 offset1, rnd_val, 4);
2594 const int8_t *filter_x,
2595 const int8_t *filter_y,
2604 src1_ptr, src2_stride,
2605 dst, dst_stride, filter_x, filter_y,
2606 height, weight0, weight1, offset0,
2607 offset1, rnd_val, 6);
2616 const int8_t *filter_x,
2617 const int8_t *filter_y,
2626 src1_ptr, src2_stride,
2627 dst, dst_stride, filter_x, filter_y,
2628 height, weight0, weight1, offset0,
2629 offset1, rnd_val, 8);
2650 v16i8 mask1, vec0, vec1;
2652 v4i32 dst0_r, dst0_l;
2653 v8i16 out0, filter_vec;
2654 v4i32 weight_vec, offset_vec, rnd_vec;
2663 offset = (offset0 + offset1) << rnd_val;
2664 weight0 = weight0 & 0x0000FFFF;
2665 weight = weight0 | (weight1 << 16);
2666 constant = 128 * weight1;
2670 offset_vec = __msa_fill_w(
offset);
2671 weight_vec = __msa_fill_w(
weight);
2672 rnd_vec = __msa_fill_w(rnd_val + 1);
2675 LD_SH2(src1_ptr, src2_stride, in0, in1);
2676 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2683 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2684 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2686 out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2688 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2689 ST_W2(out0, 0, 1, dst, dst_stride);
2712 v8i16 in0, in1, in2, in3;
2714 v4i32 weight_vec, offset_vec, rnd_vec;
2724 offset = (offset0 + offset1) << rnd_val;
2725 weight0 = weight0 & 0x0000FFFF;
2726 weight = weight0 | (weight1 << 16);
2727 constant = 128 * weight1;
2731 offset_vec = __msa_fill_w(
offset);
2732 weight_vec = __msa_fill_w(
weight);
2733 rnd_vec = __msa_fill_w(rnd_val + 1);
2737 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2742 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2745 weight_vec, rnd_vec, offset_vec,
2748 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2749 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2769 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2773 v8i16 dst0, dst1, dst2, dst3;
2774 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2776 v4i32 weight_vec, offset_vec, rnd_vec;
2783 offset = (offset0 + offset1) << rnd_val;
2784 weight0 = weight0 & 0x0000FFFF;
2785 weight = weight0 | (weight1 << 16);
2786 constant = 128 * weight1;
2790 offset_vec = __msa_fill_w(
offset);
2791 weight_vec = __msa_fill_w(
weight);
2792 rnd_vec = __msa_fill_w(rnd_val + 1);
2796 for (loop_cnt = (
height >> 3); loop_cnt--;) {
2797 LD_SB8(src0_ptr, src_stride,
2798 src0,
src1, src2, src3, src4, src5, src6, src7);
2799 src0_ptr += (8 * src_stride);
2800 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2801 src1_ptr += (4 * src2_stride);
2802 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2803 src1_ptr += (4 * src2_stride);
2810 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2812 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2814 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2818 weight_vec, rnd_vec, offset_vec,
2819 dst0, dst1, dst2, dst3);
2822 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2823 dst += (8 * dst_stride);
2844 weight0, weight1, offset0, offset1, rnd_val);
2845 }
else if (4 ==
height) {
2848 weight0, weight1, offset0, offset1, rnd_val);
2849 }
else if (0 == (
height % 8)) {
2851 src1_ptr, src2_stride,
2853 weight0, weight1, offset0, offset1,
2879 v8i16 in0, in1, in2, in3;
2880 v8i16 dst0, dst1, dst2, dst3;
2882 v4i32 weight_vec, offset_vec, rnd_vec;
2889 offset = (offset0 + offset1) << rnd_val;
2890 weight0 = weight0 & 0x0000FFFF;
2891 weight = weight0 | (weight1 << 16);
2892 constant = 128 * weight1;
2896 offset_vec = __msa_fill_w(
offset);
2897 weight_vec = __msa_fill_w(
weight);
2898 rnd_vec = __msa_fill_w(rnd_val + 1);
2902 for (loop_cnt = 2; loop_cnt--;) {
2904 src0_ptr += (4 * src_stride);
2905 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2906 src1_ptr += (4 * src2_stride);
2913 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2915 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2920 weight_vec, rnd_vec, offset_vec,
2921 dst0, dst1, dst2, dst3);
2924 ST_W2(dst0, 0, 2, dst, dst_stride);
2925 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2926 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2927 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928 dst += (4 * dst_stride);
2950 v16i8 mask1, vec0, vec1;
2953 v4i32 weight_vec, offset_vec, rnd_vec;
2960 offset = (offset0 + offset1) << rnd_val;
2961 weight0 = weight0 & 0x0000FFFF;
2962 weight = weight0 | (weight1 << 16);
2963 constant = 128 * weight1;
2967 offset_vec = __msa_fill_w(
offset);
2968 weight_vec = __msa_fill_w(
weight);
2969 rnd_vec = __msa_fill_w(rnd_val + 1);
2974 LD_SH2(src1_ptr, src2_stride, in0, in1);
2981 weight_vec, rnd_vec, offset_vec,
2984 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2985 ST_D2(dst0, 0, 1, dst, dst_stride);
3003 v16i8
src0,
src1, src2, src3, src4, src5;
3004 v8i16 in0, in1, in2, in3, in4, in5;
3008 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3010 v4i32 weight_vec, offset_vec, rnd_vec;
3017 offset = (offset0 + offset1) << rnd_val;
3018 weight0 = weight0 & 0x0000FFFF;
3019 weight = weight0 | (weight1 << 16);
3020 constant = 128 * weight1;
3024 offset_vec = __msa_fill_w(
offset);
3025 weight_vec = __msa_fill_w(
weight);
3026 rnd_vec = __msa_fill_w(rnd_val + 1);
3030 LD_SB6(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5);
3032 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3033 src1_ptr += (4 * src2_stride);
3034 LD_SH2(src1_ptr, src2_stride, in4, in5);
3040 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3042 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3044 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3046 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3050 weight_vec, rnd_vec, offset_vec,
3051 dst0, dst1, dst2, dst3);
3053 weight_vec, rnd_vec, offset_vec,
3057 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3058 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3059 ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3083 v8i16 in0, in1, in2, in3;
3084 v8i16 dst0, dst1, dst2, dst3;
3086 v4i32 weight_vec, offset_vec, rnd_vec;
3093 offset = (offset0 + offset1) << rnd_val;
3094 weight0 = weight0 & 0x0000FFFF;
3095 weight = weight0 | (weight1 << 16);
3096 constant = 128 * weight1;
3100 offset_vec = __msa_fill_w(
offset);
3101 weight_vec = __msa_fill_w(
weight);
3102 rnd_vec = __msa_fill_w(rnd_val + 1);
3106 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3108 src0_ptr += (4 * src_stride);
3109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110 src1_ptr += (4 * src2_stride);
3117 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3119 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3123 weight_vec, rnd_vec, offset_vec,
3124 dst0, dst1, dst2, dst3);
3127 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128 dst += (4 * dst_stride);
3149 weight0, weight1, offset0, offset1, rnd_val);
3150 }
else if (6 ==
height) {
3153 weight0, weight1, offset0, offset1, rnd_val);
3154 }
else if (0 == (
height % 4)) {
3156 src1_ptr, src2_stride,
3158 weight0, weight1, offset0, offset1,
3181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3184 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3188 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3190 v4i32 weight_vec, offset_vec, rnd_vec;
3197 offset = (offset0 + offset1) << rnd_val;
3198 weight0 = weight0 & 0x0000FFFF;
3199 weight = weight0 | (weight1 << 16);
3200 constant = 128 * weight1;
3204 offset_vec = __msa_fill_w(
offset);
3205 weight_vec = __msa_fill_w(
weight);
3206 rnd_vec = __msa_fill_w(rnd_val + 1);
3211 for (loop_cnt = 4; loop_cnt--;) {
3213 src0_ptr += (4 * src_stride);
3214 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3215 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3216 src1_ptr += (4 * src2_stride);
3224 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3226 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3230 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3235 weight_vec, rnd_vec, offset_vec,
3236 dst0, dst1, dst2, dst3);
3238 weight_vec, rnd_vec, offset_vec,
3242 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3243 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3244 ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3245 dst += (4 * dst_stride);
3265 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
3266 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3270 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3273 v4i32 weight_vec, offset_vec, rnd_vec;
3280 offset = (offset0 + offset1) << rnd_val;
3281 weight0 = weight0 & 0x0000FFFF;
3282 weight = weight0 | (weight1 << 16);
3283 constant = 128 * weight1;
3287 offset_vec = __msa_fill_w(
offset);
3288 weight_vec = __msa_fill_w(
weight);
3289 rnd_vec = __msa_fill_w(rnd_val + 1);
3293 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3294 LD_SB4(src0_ptr, src_stride,
src0, src2, src4, src6);
3295 LD_SB4(src0_ptr + 8, src_stride,
src1, src3, src5, src7);
3296 src0_ptr += (4 * src_stride);
3297 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3298 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3299 src1_ptr += (4 * src2_stride);
3306 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3308 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3310 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3312 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3314 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3316 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3320 weight_vec, rnd_vec, offset_vec,
3321 dst0, dst1, dst2, dst3);
3324 ST_SH2(dst0, dst1, dst, dst_stride);
3325 dst += (2 * dst_stride);
3329 weight_vec, rnd_vec, offset_vec,
3330 dst0, dst1, dst2, dst3);
3333 ST_SH2(dst0, dst1, dst, dst_stride);
3334 dst += (2 * dst_stride);
3357 v16i8 mask1, mask2, mask3;
3359 v8i16 dst0, dst1, dst2, dst3;
3360 v8i16 in0, in1, in2, in3, in4, in5;
3362 v4i32 weight_vec, offset_vec, rnd_vec;
3369 offset = (offset0 + offset1) << rnd_val;
3370 weight0 = weight0 & 0x0000FFFF;
3371 weight = weight0 | (weight1 << 16);
3372 constant = 128 * weight1;
3376 offset_vec = __msa_fill_w(
offset);
3377 weight_vec = __msa_fill_w(
weight);
3378 rnd_vec = __msa_fill_w(rnd_val + 1);
3384 for (loop_cnt = 16; loop_cnt--;) {
3386 LD_SB2(src0_ptr + 16, src_stride,
src1, src3);
3387 src0_ptr += (2 * src_stride);
3388 LD_SH2(src1_ptr, src2_stride, in0, in2);
3389 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3390 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3391 src1_ptr += (2 * src2_stride);
3398 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3400 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3404 weight_vec, rnd_vec, offset_vec,
3405 dst0, dst1, dst2, dst3);
3408 ST_SH2(dst0, dst1, dst, dst_stride);
3413 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3416 weight_vec, rnd_vec, offset_vec,
3419 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3420 ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3421 dst += (2 * dst_stride);
3444 v16i8 mask1, mask2, mask3;
3445 v8i16 dst0, dst1, dst2, dst3;
3447 v8i16 in0, in1, in2, in3;
3449 v4i32 weight_vec, offset_vec, rnd_vec;
3456 offset = (offset0 + offset1) << rnd_val;
3457 weight0 = weight0 & 0x0000FFFF;
3458 weight = weight0 | (weight1 << 16);
3459 constant = 128 * weight1;
3463 offset_vec = __msa_fill_w(
offset);
3464 weight_vec = __msa_fill_w(
weight);
3465 rnd_vec = __msa_fill_w(rnd_val + 1);
3471 for (loop_cnt =
height; loop_cnt--;) {
3473 src2 =
LD_SB(src0_ptr + 24);
3474 src0_ptr += src_stride;
3475 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3476 src1_ptr += src2_stride;
3485 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3489 weight_vec, rnd_vec, offset_vec,
3490 dst0, dst1, dst2, dst3);
3493 ST_SH2(dst0, dst1, dst, 16);
3512 v16i8
src0,
src1, src2, src3, src4;
3513 v8i16 in0, in1, dst10;
3514 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3515 v4i32 dst10_r, dst10_l;
3517 v8i16 filter_vec,
out;
3518 v4i32 weight_vec, offset_vec, rnd_vec;
3520 src0_ptr -= src_stride;
3522 offset = (offset0 + offset1) << rnd_val;
3523 weight0 = weight0 & 0x0000FFFF;
3524 weight = weight0 | (weight1 << 16);
3525 constant = 128 * weight1;
3529 offset_vec = __msa_fill_w(
offset);
3530 weight_vec = __msa_fill_w(
weight);
3531 rnd_vec = __msa_fill_w(rnd_val + 1);
3537 src0_ptr += (3 * src_stride);
3539 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3540 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3541 LD_SB2(src0_ptr, src_stride, src3, src4);
3542 src0_ptr += (2 * src_stride);
3543 LD_SH2(src1_ptr, src2_stride, in0, in1);
3544 src1_ptr += (2 * src2_stride);
3546 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3547 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3548 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3549 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3554 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3555 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3557 out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3559 out = (v8i16) __msa_pckev_b((v16i8)
out, (v16i8)
out);
3577 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3578 v8i16 in0, in1, in2, in3;
3579 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580 v16i8 src2110, src4332, src6554;
3584 v4i32 weight_vec, offset_vec, rnd_vec;
3586 src0_ptr -= src_stride;
3588 offset = (offset0 + offset1) << rnd_val;
3589 weight0 = weight0 & 0x0000FFFF;
3590 weight = weight0 | (weight1 << 16);
3591 constant = 128 * weight1;
3595 offset_vec = __msa_fill_w(
offset);
3596 weight_vec = __msa_fill_w(
weight);
3597 rnd_vec = __msa_fill_w(rnd_val + 1);
3603 src0_ptr += (3 * src_stride);
3605 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3608 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609 src0_ptr += (4 * src_stride);
3610 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611 src1_ptr += (4 * src2_stride);
3613 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614 src32_r, src43_r, src54_r, src65_r);
3615 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3622 weight_vec, rnd_vec, offset_vec,
3625 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3627 dst += (4 * dst_stride);
3646 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9;
3647 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3648 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3649 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3650 v16i8 src2110, src4332, src6554, src8776;
3651 v8i16 dst10, dst32, dst54, dst76;
3654 v4i32 weight_vec, offset_vec, rnd_vec;
3656 src0_ptr -= src_stride;
3658 offset = (offset0 + offset1) << rnd_val;
3659 weight0 = weight0 & 0x0000FFFF;
3660 weight = weight0 | (weight1 << 16);
3661 constant = 128 * weight1;
3665 offset_vec = __msa_fill_w(
offset);
3666 weight_vec = __msa_fill_w(
weight);
3667 rnd_vec = __msa_fill_w(rnd_val + 1);
3673 src0_ptr += (3 * src_stride);
3675 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3676 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3678 for (loop_cnt = (
height >> 3); loop_cnt--;) {
3679 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3680 src0_ptr += (6 * src_stride);
3681 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3682 src1_ptr += (8 * src2_stride);
3687 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3688 src32_r, src43_r, src54_r, src65_r);
3689 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3690 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3691 src4332, src6554, src8776);
3698 LD_SB2(src0_ptr, src_stride, src9, src2);
3699 src0_ptr += (2 * src_stride);
3700 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3701 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3702 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3707 weight_vec, rnd_vec, offset_vec,
3708 dst10, dst32, dst54, dst76);
3710 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3711 ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3712 dst += (8 * dst_stride);
3733 weight0, weight1, offset0, offset1, rnd_val);
3734 }
else if (4 ==
height) {
3737 weight0, weight1, offset0, offset1, rnd_val);
3738 }
else if (0 == (
height % 8)) {
3740 src1_ptr, src2_stride,
3742 weight0, weight1, offset0, offset1,
/* Body fragment of a bi-weighted routine (header and several statements are
 * not part of this excerpt).  Visible work: build splatted offset / weight /
 * rounding vectors, then run height / 4 iterations, each storing four rows
 * through a W store at dst plus an H store at dst + 4. */
3763 v16i8
src0,
src1, src2, src3, src4;
3764 v8i16 in0, in1, in2, in3;
3765 v16i8 src10_r, src32_r, src21_r, src43_r;
3766 v8i16 tmp0, tmp1, tmp2, tmp3;
3769 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
3771 src0_ptr -= src_stride;
/* NOTE(review): if offset0 + offset1 is signed and negative, this left shift
 * is undefined behaviour in C — confirm operand types and ranges. */
3773 offset = (offset0 + offset1) << rnd_val;
/* Keep the low 16 bits of weight0 and put weight1 in the high 16 bits.
 * NOTE(review): weight1 << 16 is likewise undefined for a negative signed
 * weight1 — confirm. */
3774 weight0 = weight0 & 0x0000FFFF;
3775 weight = weight0 | (weight1 << 16);
/* 128 * weight1; how it is consumed is not visible in this excerpt. */
3776 constant = 128 * weight1;
/* __msa_fill_w replicates the scalar into every 32-bit lane. */
3780 offset_vec = __msa_fill_w(
offset);
3781 weight_vec = __msa_fill_w(
weight);
/* The rounding vector carries rnd_val + 1. */
3782 rnd_vec = __msa_fill_w(rnd_val + 1);
3788 src0_ptr += (3 * src_stride);
/* One iteration per four output rows. */
3792 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3793 LD_SB2(src0_ptr, src_stride, src3, src4);
3794 src0_ptr += (2 * src_stride);
3795 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3796 src1_ptr += (4 * src2_stride);
3798 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3804 src0_ptr += (2 * src_stride);
3812 weight_vec, rnd_vec, offset_vec,
3813 tmp0, tmp1, tmp2, tmp3);
/* Rows 0-1 come from tmp0 and rows 2-3 from tmp1; each row is written as a
 * W store at dst followed by an H store at dst + 4 (presumably 4 + 2 bytes —
 * the store macros are defined outside this excerpt). */
3816 ST_W2(tmp0, 0, 2, dst, dst_stride);
3817 ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3818 ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3819 ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3820 dst += (4 * dst_stride);
/* Body fragment of a bi-weighted routine with no loop visible (header and
 * several statements are not part of this excerpt).  It loads two rows from
 * each source, packs tmp0/tmp1 to bytes and stores through ST_D2. */
3838 v16i8
src0,
src1, src2, src3, src4;
3839 v8i16 in0, in1, tmp0, tmp1;
3840 v16i8 src10_r, src32_r, src21_r, src43_r;
3843 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
3845 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
3847 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
3848 weight0 = weight0 & 0x0000FFFF;
3849 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
3850 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
3854 offset_vec = __msa_fill_w(
offset);
3855 weight_vec = __msa_fill_w(
weight);
3856 rnd_vec = __msa_fill_w(rnd_val + 1);
3862 src0_ptr += (3 * src_stride);
3866 LD_SB2(src0_ptr, src_stride, src3, src4);
3867 LD_SH2(src1_ptr, src2_stride, in0, in1);
3869 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3874 weight_vec, rnd_vec, offset_vec,
/* Pack the even-indexed bytes of tmp0 and tmp1 into a single vector. */
3877 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
/* Presumably writes doubleword elements 0 and 1 to two consecutive rows —
 * ST_D2 is defined outside this excerpt. */
3878 ST_D2(tmp0, 0, 1, dst, dst_stride);
/* Body fragment of a bi-weighted routine with no loop visible (header and
 * several statements are not part of this excerpt).  Six rows are loaded
 * from each source and the result is stored with ST_D4 at dst followed by
 * ST_D2 at dst + 4 * dst_stride. */
3895 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3896 v8i16 in0, in1, in2, in3, in4, in5;
3897 v16i8 src10_r, src32_r, src54_r, src76_r;
3898 v16i8 src21_r, src43_r, src65_r, src87_r;
3899 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3902 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
3904 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
3906 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
3907 weight0 = weight0 & 0x0000FFFF;
3908 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
3909 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
3913 offset_vec = __msa_fill_w(
offset);
3914 weight_vec = __msa_fill_w(
weight);
3915 rnd_vec = __msa_fill_w(rnd_val + 1);
3921 src0_ptr += (3 * src_stride);
/* Six more rows from the first source, six rows from the second source. */
3925 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3926 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
/* Interleave each row with the row before it (right halves). */
3928 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3929 src32_r, src43_r, src54_r, src65_r);
3930 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3940 weight_vec, rnd_vec, offset_vec,
3941 tmp0, tmp1, tmp2, tmp3);
3943 weight_vec, rnd_vec, offset_vec,
/* Pack the even-indexed bytes of tmp4 and tmp5 into tmp3. */
3947 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
/* First store starts at dst, the second four rows further down. */
3948 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3949 ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Body fragment of a looped bi-weighted routine (header and several
 * statements are not part of this excerpt).  The loop runs height / 4 times
 * and advances dst by four rows after each ST_D4. */
3968 v16i8
src0,
src1, src2, src3, src4;
3969 v8i16 in0, in1, in2, in3;
3970 v16i8 src10_r, src32_r, src21_r, src43_r;
3971 v8i16 tmp0, tmp1, tmp2, tmp3;
3974 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
3976 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
3978 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
3979 weight0 = weight0 & 0x0000FFFF;
3980 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
3981 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
3985 offset_vec = __msa_fill_w(
offset);
3986 weight_vec = __msa_fill_w(
weight);
3987 rnd_vec = __msa_fill_w(rnd_val + 1);
3993 src0_ptr += (3 * src_stride);
/* One iteration per four output rows. */
3997 for (loop_cnt = (
height >> 2); loop_cnt--;) {
3998 LD_SB2(src0_ptr, src_stride, src3, src4);
3999 src0_ptr += (2 * src_stride);
4000 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4001 src1_ptr += (4 * src2_stride);
4003 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
/* Second source advance of the iteration; the load it follows is not part of
 * this excerpt. */
4009 src0_ptr += (2 * src_stride);
4017 weight_vec, rnd_vec, offset_vec,
4018 tmp0, tmp1, tmp2, tmp3);
4021 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4022 dst += (4 * dst_stride);
/* Tail of a height-based dispatcher; the header, the first condition and the
 * callee names are not part of this excerpt. */
4043 weight0, weight1, offset0, offset1, rnd_val);
4044 }
/* Exactly six rows. */
else if (6 ==
height) {
4047 weight0, weight1, offset0, offset1, rnd_val);
/* Arguments of a further call; the branch that guards it is not visible. */
4050 src1_ptr, src2_stride,
4052 weight0, weight1, offset0, offset1,
/* Body fragment of a looped bi-weighted routine (header and several
 * statements are not part of this excerpt).  Each of the height / 4
 * iterations reads the second source at src1_ptr and src1_ptr + 8 and stores
 * four rows as ST_D4 at dst plus ST_W4 at dst + 8. */
4073 v16i8
src0,
src1, src2, src3, src4, src5;
4074 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4075 v16i8 src10_r, src32_r, src21_r, src43_r;
4076 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4077 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4078 v16i8 src2110, src4332;
4081 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
4083 src0_ptr -= (1 * src_stride);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4085 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4086 weight0 = weight0 & 0x0000FFFF;
4087 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
4088 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
4092 offset_vec = __msa_fill_w(
offset);
4093 weight_vec = __msa_fill_w(
weight);
4094 rnd_vec = __msa_fill_w(rnd_val + 1);
4100 src0_ptr += (3 * src_stride);
/* Combine the low doublewords of the two left-half interleaves. */
4104 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
/* One iteration per four output rows. */
4106 for (loop_cnt = (
height >> 2); loop_cnt--;) {
4107 LD_SB2(src0_ptr, src_stride, src3, src4);
4108 src0_ptr += (2 * src_stride);
/* Second-source rows: in0..in3 at src1_ptr, in4..in7 eight elements on. */
4109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4110 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4111 src1_ptr += (4 * src2_stride);
4115 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4116 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4117 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
/* The second of the two new rows lands in src2, and the interleaves below
 * overwrite src10_r / src21_r / src2110, which the pre-loop code also set. */
4123 LD_SB2(src0_ptr, src_stride, src5, src2);
4124 src0_ptr += (2 * src_stride);
4126 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4127 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4128 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4135 weight_vec, rnd_vec, offset_vec,
4136 tmp0, tmp1, tmp2, tmp3);
4138 weight_vec, rnd_vec, offset_vec,
/* Pack the even-indexed bytes of tmp4 and tmp5 into tmp2. */
4142 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
/* tmp0/tmp1 go to dst, tmp2 to dst + 8, four rows each. */
4143 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4144 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4145 dst += (4 * dst_stride);
/* Body fragment of a looped bi-weighted routine (header and several
 * statements are not part of this excerpt).  Each of the height / 4
 * iterations is split into two halves that each load two new source rows and
 * store two rows with ST_SH2. */
4165 v16i8
src0,
src1, src2, src3, src4, src5;
4166 v8i16 in0, in1, in2, in3;
4167 v16i8 src10_r, src32_r, src21_r, src43_r;
4168 v16i8 src10_l, src32_l, src21_l, src43_l;
4169 v8i16 tmp0, tmp1, tmp2, tmp3;
4172 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
4174 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4176 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4177 weight0 = weight0 & 0x0000FFFF;
4178 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
4179 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
4183 offset_vec = __msa_fill_w(
offset);
4184 weight_vec = __msa_fill_w(
weight);
4185 rnd_vec = __msa_fill_w(rnd_val + 1);
4191 src0_ptr += (3 * src_stride);
/* One iteration per four output rows. */
4196 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* First half: rows into src3/src4. */
4197 LD_SB2(src0_ptr, src_stride, src3, src4);
4198 src0_ptr += (2 * src_stride);
4199 LD_SH2(src1_ptr, src2_stride, in0, in1);
4200 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4201 src1_ptr += (2 * src2_stride);
/* Right- and left-half interleaves of each row with its predecessor. */
4203 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4204 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4213 weight_vec, rnd_vec, offset_vec,
4214 tmp0, tmp1, tmp2, tmp3);
4216 ST_SH2(tmp0, tmp1, dst, dst_stride);
4217 dst += (2 * dst_stride);
/* Second half: rows into src5/src2; the second row is loaded into src2. */
4218 LD_SB2(src0_ptr, src_stride, src5, src2);
4219 src0_ptr += (2 * src_stride);
4221 LD_SH2(src1_ptr, src2_stride, in0, in1);
4222 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4223 src1_ptr += (2 * src2_stride);
4225 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4226 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4234 weight_vec, rnd_vec, offset_vec,
4235 tmp0, tmp1, tmp2, tmp3);
4238 ST_SH2(tmp0, tmp1, dst, dst_stride);
4239 dst += (2 * dst_stride);
/* Body fragment of a looped bi-weighted routine (header and several
 * statements are not part of this excerpt).  Two column groups are handled:
 * one read at src0_ptr / stored with ST_SH2 at dst, and one read at
 * src0_ptr + 16 / stored with ST_D2 at dst + 16.  Each of the height / 4
 * iterations contains two halves of two rows. */
4259 v16i8
src0,
src1, src2, src3, src4, src5;
4260 v16i8 src6, src7, src8, src9, src10, src11;
4261 v8i16 in0, in1, in2, in3, in4, in5;
4262 v16i8 src10_r, src32_r, src76_r, src98_r;
4263 v16i8 src10_l, src32_l, src21_l, src43_l;
4264 v16i8 src21_r, src43_r, src87_r, src109_r;
4265 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4268 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
4270 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4272 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4273 weight0 = weight0 & 0x0000FFFF;
4274 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
4275 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
4279 offset_vec = __msa_fill_w(
offset);
4280 weight_vec = __msa_fill_w(
weight);
4281 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Initial three rows of the column group that starts at byte 16. */
4292 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4293 src0_ptr += (3 * src_stride);
4295 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
/* One iteration per four output rows. */
4297 for (loop_cnt = (
height >> 2); loop_cnt--;) {
/* First half, column group at offset 0. */
4299 LD_SB2(src0_ptr, src_stride, src3, src4);
4300 LD_SH2(src1_ptr, src2_stride, in0, in1);
4301 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4303 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4304 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* First half, column group at offset 16. */
4307 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4308 src0_ptr += (2 * src_stride);
4309 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4310 src1_ptr += (2 * src2_stride);
4312 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4324 weight_vec, rnd_vec, offset_vec,
4325 tmp0, tmp1, tmp4, tmp5);
4328 weight_vec, rnd_vec, offset_vec,
/* Pack the even-indexed bytes of tmp2 and tmp3 into tmp2. */
4333 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4334 ST_SH2(tmp0, tmp1, dst, dst_stride);
4335 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4336 dst += (2 * dst_stride);
/* Second half: the second new row is loaded into src2 (and src8 below). */
4339 LD_SB2(src0_ptr, src_stride, src5, src2);
4340 LD_SH2(src1_ptr, src2_stride, in0, in1);
4341 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4343 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4344 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4346 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4347 src0_ptr += (2 * src_stride);
4348 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4349 src1_ptr += (2 * src2_stride);
4351 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4363 weight_vec, rnd_vec, offset_vec,
4364 tmp0, tmp1, tmp4, tmp5);
4367 weight_vec, rnd_vec, offset_vec,
/* Pack the even-indexed bytes of tmp2 and tmp3 into tmp2. */
4373 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4374 ST_SH2(tmp0, tmp1, dst, dst_stride);
4375 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4376 dst += (2 * dst_stride);
/* Body fragment of a looped bi-weighted routine (header and several
 * statements are not part of this excerpt).  The loop runs height / 2 times;
 * each iteration stores two rows through dst and two rows through dst_tmp
 * (the initialisation of dst_tmp is not visible here). */
4397 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
4398 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4399 v16i8 src10_r, src32_r, src76_r, src98_r;
4400 v16i8 src21_r, src43_r, src87_r, src109_r;
4401 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4402 v16i8 src10_l, src32_l, src76_l, src98_l;
4403 v16i8 src21_l, src43_l, src87_l, src109_l;
4406 v4i32 weight_vec, offset_vec, rnd_vec;
/* Step back one source row before the initial loads. */
4408 src0_ptr -= src_stride;
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4410 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4411 weight0 = weight0 & 0x0000FFFF;
4412 weight = weight0 | (weight1 << 16);
/* 128 * weight1; its use is not visible in this excerpt. */
4413 constant = 128 * weight1;
/* Replicate each scalar into all 32-bit lanes. */
4417 offset_vec = __msa_fill_w(
offset);
4418 weight_vec = __msa_fill_w(
weight);
4419 rnd_vec = __msa_fill_w(rnd_val + 1);
/* Initial three rows of the column group that starts at byte 16. */
4430 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4431 src0_ptr += (3 * src_stride);
4433 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4434 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* One iteration per two output rows. */
4436 for (loop_cnt = (
height >> 1); loop_cnt--;) {
/* Column group at offset 0. */
4438 LD_SB2(src0_ptr, src_stride, src3, src4);
4439 LD_SH2(src1_ptr, src2_stride, in0, in1);
4440 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4442 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4443 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4453 weight_vec, rnd_vec, offset_vec,
4454 tmp0, tmp1, tmp4, tmp5);
4457 ST_SH2(tmp0, tmp1, dst, dst_stride);
4458 dst += (2 * dst_stride);
/* Column group at offset 16; second-source data at +16 and +24. */
4467 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4468 src0_ptr += (2 * src_stride);
4469 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4470 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4471 src1_ptr += (2 * src2_stride);
4473 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4474 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4483 weight_vec, rnd_vec, offset_vec,
4484 tmp2, tmp3, tmp6, tmp7);
4488 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4489 dst_tmp += (2 * dst_stride);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  No loop is visible. */
4505 const int8_t *filter_x,
4506 const int8_t *filter_y,
4517 v16i8
src0,
src1, src2, src3, src4;
4519 v8i16 filt_h0, filt_h1;
4522 v8i16 filter_vec,
tmp, weight_vec;
4523 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4524 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4525 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
4527 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
4529 filter_vec =
LD_SH(filter_x);
4532 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4539 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4540 weight0 = weight0 & 0x0000FFFF;
4541 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
4543 const_vec = __msa_fill_w((128 * weight1));
4545 offset_vec = __msa_fill_w(
offset);
/* Viewed as halfwords, each lane pair is (weight0, weight1). */
4546 weight_vec = (v8i16) __msa_fill_w(
weight);
4547 rnd_vec = __msa_fill_w(rnd_val + 1);
4548 offset_vec += const_vec;
4555 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
/* Pack the even-indexed halfwords of dst0 and dst1 into one vector. */
4568 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
/* Presumably two 64-bit loads from consecutive rows of the second source —
 * LD2 is defined outside this excerpt. */
4570 LD2(src1_ptr, src2_stride, tp0, tp1);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
4574 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4575 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4577 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
/* Pack the even-indexed bytes of tmp (used for both operands) into out. */
4579 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8)
tmp);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  Seven source rows are loaded at once and the
 * result is written with a single ST_W4. */
4589 const int8_t *filter_x,
4590 const int8_t *filter_y,
4600 v8i16 in0 = { 0 }, in1 = { 0 };
4601 v16i8
src0,
src1, src2, src3, src4, src5, src6;
4603 v8i16 filt_h0, filt_h1;
4606 v8i16 filter_vec, weight_vec;
4607 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4608 v8i16 tmp0, tmp1, tmp2, tmp3;
4609 v8i16 dst30, dst41, dst52, dst63;
4610 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4611 v4i32 offset_vec, rnd_vec, const_vec;
4612 v4i32 dst0, dst1, dst2, dst3;
/* Step back one row and one column before loading. */
4614 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
4616 filter_vec =
LD_SH(filter_x);
4619 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4626 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4627 weight0 = weight0 & 0x0000FFFF;
4628 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
4630 const_vec = __msa_fill_w((128 * weight1));
4632 offset_vec = __msa_fill_w(
offset);
4633 weight_vec = (v8i16) __msa_fill_w(
weight);
4634 rnd_vec = __msa_fill_w(rnd_val + 1);
4635 offset_vec += const_vec;
4637 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
/* Rows three apart are shuffled together (src2 with src5, src3 with src6). */
4642 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4643 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
/* Presumably shifts each of the four vectors right by 6 — SRA_4V is defined
 * outside this excerpt. */
4657 SRA_4V(dst0, dst1, dst2, dst3, 6);
/* Two LD2 loads into the same tp0/tp1 pair, two rows apart; the statement
 * that consumes the first pair is not part of this excerpt. */
4660 LD2(src1_ptr, src2_stride, tp0, tp1);
4662 src1_ptr += (2 * src2_stride);
4663 LD2(src1_ptr, src2_stride, tp0, tp1);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
4669 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4670 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4671 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4672 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Pack the even-indexed bytes of tmp0 and tmp1 into out. */
4676 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4677 ST_W4(
out, 0, 1, 2, 3, dst, dst_stride);
/* Fragment of a looped bi-weighted routine taking separate filter_x and
 * filter_y coefficient pointers (the rest of the signature and several
 * statements are not part of this excerpt).  The loop runs height / 8 times,
 * loading eight source rows and storing eight rows with ST_W8 each time. */
4686 const int8_t *filter_x,
4687 const int8_t *filter_y,
4699 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4700 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4702 v8i16 filt_h0, filt_h1;
4703 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4706 v8i16 filter_vec, weight_vec;
4707 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4708 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4709 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4710 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4711 v8i16 dst98_r, dst109_r;
4712 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4713 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
4715 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
4717 filter_vec =
LD_SH(filter_x);
4720 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4727 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4728 weight0 = weight0 & 0x0000FFFF;
4729 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
4731 const_vec = __msa_fill_w((128 * weight1));
4733 offset_vec = __msa_fill_w(
offset);
4734 weight_vec = (v8i16) __msa_fill_w(
weight);
4735 rnd_vec = __msa_fill_w(rnd_val + 1);
4736 offset_vec += const_vec;
4739 src0_ptr += (3 * src_stride);
/* Replicate doubleword element 1 of dst21 into dst22. */
4747 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* One iteration per eight output rows. */
4749 for (loop_cnt =
height >> 3; loop_cnt--;) {
4750 LD_SB8(src0_ptr, src_stride,
4751 src3, src4, src5, src6, src7, src8, src9, src10);
4752 src0_ptr += (8 * src_stride);
/* Rows four apart are shuffled together (src3 with src7, and so on). */
4754 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4755 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4756 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4757 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4764 dst32_r = __msa_ilvr_h(dst73, dst22);
/* dst22 is re-derived from the upper doubleword of dst73 before it is
 * interleaved with dst106. */
4768 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4769 dst76_r = __msa_ilvr_h(dst22, dst106);
/* Four LD2 loads into the same tp0/tp1 pair, advancing two rows each time;
 * the statements that consume each pair are not part of this excerpt. */
4771 LD2(src1_ptr, src2_stride, tp0, tp1);
4772 src1_ptr += 2 * src2_stride;
4774 LD2(src1_ptr, src2_stride, tp0, tp1);
4775 src1_ptr += 2 * src2_stride;
4778 LD2(src1_ptr, src2_stride, tp0, tp1);
4779 src1_ptr += 2 * src2_stride;
4781 LD2(src1_ptr, src2_stride, tp0, tp1);
4782 src1_ptr += 2 * src2_stride;
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
4793 SRA_4V(dst0, dst1, dst2, dst3, 6);
4794 SRA_4V(dst4, dst5, dst6, dst7, 6);
4795 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
4801 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4802 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4803 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4804 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4805 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4806 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4807 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4808 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4811 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4815 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4816 dst += (8 * dst_stride);
/* Refresh dst22 from the upper doubleword of dst106 for the next pass. */
4820 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Fragment of a height-based dispatcher taking filter_x and filter_y (the
 * rest of the signature, the first condition and the callee names are not
 * part of this excerpt).  Every visible call forwards the filters, weights,
 * offsets and rnd_val. */
4830 const int8_t *filter_x,
4831 const int8_t *filter_y,
4841 dst, dst_stride, filter_x, filter_y,
4842 weight0, weight1, offset0, offset1, rnd_val);
4843 }
/* Exactly four rows. */
else if (4 ==
height) {
4845 dst, dst_stride, filter_x, filter_y,
4846 weight0, weight1, offset0, offset1, rnd_val);
4847 }
/* Any height that is a multiple of eight; this call also passes height. */
else if (0 == (
height % 8)) {
4849 src1_ptr, src2_stride,
4850 dst, dst_stride, filter_x, filter_y,
4851 height, weight0, weight1,
4852 offset0, offset1, rnd_val);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  No loop is visible: eight rows are stored with
 * ST_W8 at dst and a further ST_H8 at dst + 4. */
4862 const int8_t *filter_x,
4863 const int8_t *filter_y,
4871 uint32_t tpw0, tpw1, tpw2, tpw3;
4874 v16u8 out0, out1, out2;
4875 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877 v8i16 in4 = { 0 }, in5 = { 0 };
4879 v8i16 filt_h0, filt_h1, filter_vec;
4880 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4883 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
4894 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
4896 filter_vec =
LD_SH(filter_x);
4899 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
4906 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
4907 weight0 = weight0 & 0x0000FFFF;
4908 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
4910 const_vec = __msa_fill_w((128 * weight1));
4912 offset_vec = __msa_fill_w(
offset);
4913 weight_vec = (v8i16) __msa_fill_w(
weight);
4914 rnd_vec = __msa_fill_w(rnd_val + 1);
4915 offset_vec += const_vec;
4918 src0_ptr += (3 * src_stride);
/* Each row is shuffled against itself with mask0/mask1. */
4923 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4931 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4935 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4945 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
/* Merge pairs of left-half interleaves into single vectors. */
4963 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4978 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
4979 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
/* Four LD2 loads at row offsets 0, 2, 4 and 6 reuse tp0/tp1; the statements
 * that consume each pair are not part of this excerpt. */
4985 LD2(src1_ptr, src2_stride, tp0, tp1);
4987 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4990 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4992 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
4999 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5009 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5013 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
/* Remaining columns: built from the left-half results, with second-source
 * words read at src1_ptr + 4 and the output written at dst + 4. */
5015 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5017 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018 src1_ptr += (4 * src2_stride);
5020 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5026 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5034 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5035 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements,
 * including the final store, are not part of this excerpt).  No loop is
 * visible; two second-source rows are loaded. */
5044 const int8_t *filter_x,
5045 const int8_t *filter_y,
5054 v16i8
src0,
src1, src2, src3, src4;
5056 v8i16 filt_h0, filt_h1;
5059 v8i16 filter_vec, weight_vec;
5060 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5061 v8i16 dst0, dst1, dst2, dst3, dst4;
5063 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5064 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5065 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5066 v8i16 tmp0, tmp1, tmp2, tmp3;
5067 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
5069 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
5071 filter_vec =
LD_SH(filter_x);
5074 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
5081 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
5082 weight0 = weight0 & 0x0000FFFF;
5083 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
5085 const_vec = __msa_fill_w((128 * weight1));
5087 offset_vec = __msa_fill_w(
offset);
5088 weight_vec = (v8i16) __msa_fill_w(
weight);
5089 rnd_vec = __msa_fill_w(rnd_val + 1);
5090 offset_vec += const_vec;
5095 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Each row is shuffled against itself with mask0/mask1. */
5099 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5100 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5101 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
5117 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5118 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
5123 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5124 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5125 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5126 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
/* Presumably a rounding right shift by rnd_vec (rnd_val + 1 per lane) —
 * SRAR_W4_SW is defined outside this excerpt. */
5127 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5128 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
/* Pack the even-indexed bytes of tmp0 and tmp1 into out. */
5130 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  The visible loop runs width8mult times, each
 * pass loading seven source rows and storing four rows with ST_D4. */
5140 const int8_t *filter_x,
5141 const int8_t *filter_y,
5152 v16i8
src0,
src1, src2, src3, src4, src5, src6, mask0, mask1;
5153 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5154 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5155 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5156 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5157 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5158 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5159 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5160 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5161 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
5163 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
5165 filter_vec =
LD_SH(filter_x);
5168 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
5176 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
5177 weight0 = weight0 & 0x0000FFFF;
5178 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
5180 const_vec = __msa_fill_w((128 * weight1));
5182 offset_vec = __msa_fill_w(
offset);
5183 rnd_vec = __msa_fill_w(rnd_val + 1);
5184 offset_vec += const_vec;
5185 weight_vec = (v8i16) __msa_fill_w(
weight);
/* One pass per column group; the pointer advances between passes are not
 * part of this excerpt. */
5187 for (cnt = width8mult; cnt--;) {
5188 LD_SB7(src0_ptr, src_stride,
src0,
src1, src2, src3, src4, src5, src6);
5192 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* Each row is shuffled against itself with mask0/mask1. */
5197 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5206 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5207 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5208 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5209 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
5230 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5231 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5232 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5233 dst3_r, dst0, dst1, dst2, dst3);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
5239 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5240 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5241 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5242 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5243 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5244 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5245 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5246 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5249 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5250 tmp0, tmp1, tmp2, tmp3);
5253 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  No loop is visible: six second-source rows are
 * loaded and the output is stored as ST_D4 at dst plus ST_D2 four rows down. */
5264 const int8_t *filter_x,
5265 const int8_t *filter_y,
5273 v16u8 out0, out1, out2;
5274 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5276 v8i16 filt_h0, filt_h1;
5279 v8i16 filter_vec, weight_vec;
5280 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5281 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5282 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5283 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5284 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5285 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5286 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5287 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5288 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5289 v8i16 in0, in1, in2, in3, in4, in5;
5290 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5291 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5292 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
5294 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
5296 filter_vec =
LD_SH(filter_x);
5299 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
5306 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
5307 weight0 = weight0 & 0x0000FFFF;
5308 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
5310 const_vec = __msa_fill_w((128 * weight1));
5312 offset_vec = __msa_fill_w(
offset);
5313 weight_vec = (v8i16) __msa_fill_w(
weight);
5314 rnd_vec = __msa_fill_w(rnd_val + 1);
5315 offset_vec += const_vec;
/* Skip five rows (their load is not part of this excerpt), then load four
 * more into src5..src8. */
5318 src0_ptr += (5 * src_stride);
5319 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5324 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
/* Each row is shuffled against itself with mask0/mask1. */
5328 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5329 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5330 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5331 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5332 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5333 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5334 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
5368 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5369 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5370 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5371 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5372 dst0, dst1, dst2, dst3);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
5378 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5379 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5380 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5381 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5382 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5383 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5384 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5385 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5388 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5389 tmp0, tmp1, tmp2, tmp3);
/* The dst4/dst5 results are packed into dst0/dst1 and weighted in a second
 * group of dot products. */
5393 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5396 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5397 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5398 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5399 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5403 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5404 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5405 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Fragment of a bi-weighted routine taking separate filter_x and filter_y
 * coefficient pointers (the rest of the signature and several statements are
 * not part of this excerpt).  An outer loop runs width / 8 times and restarts
 * the working source pointers; an inner loop runs height / 4 times and
 * stores four rows per pass through dst_tmp. */
5414 const int8_t *filter_x,
5415 const int8_t *filter_y,
5428 int16_t *src1_ptr_tmp;
5431 v16i8
src0,
src1, src2, src3, src4, src5, src6;
5432 v8i16 in0, in1, in2, in3;
5434 v8i16 filt_h0, filt_h1;
5438 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5439 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5440 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5441 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5442 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5444 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5445 v4i32 offset_vec, rnd_vec, const_vec;
/* Step back one row and one column before loading. */
5447 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not part of this excerpt. */
5449 filter_vec =
LD_SH(filter_x);
5452 filter_vec =
LD_SH(filter_y);
/* NOTE(review): left-shifting a negative signed value is undefined behaviour
 * in C; confirm the types/ranges of the offsets and of weight1 below. */
5459 offset = (offset0 + offset1) << rnd_val;
/* weight0 occupies the low 16 bits, weight1 the high 16 bits. */
5460 weight0 = weight0 & 0x0000FFFF;
5461 weight = weight0 | (weight1 << 16);
/* Every 32-bit lane holds 128 * weight1; it is added to offset_vec below. */
5463 const_vec = __msa_fill_w((128 * weight1));
5465 offset_vec = __msa_fill_w(
offset);
5466 weight_vec = (v8i16) __msa_fill_w(
weight);
5467 rnd_vec = __msa_fill_w(rnd_val + 1);
5468 offset_vec += const_vec;
/* One outer pass per group of eight columns. */
5470 for (cnt =
width >> 3; cnt--;) {
/* Restart the working pointers at the top of the current column group. */
5471 src0_ptr_tmp = src0_ptr;
5472 src1_ptr_tmp = src1_ptr;
5476 src0_ptr_tmp += (3 * src_stride);
5481 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* One inner pass per four output rows. */
5489 for (loop_cnt =
height >> 2; loop_cnt--;) {
5490 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5491 src0_ptr_tmp += (4 * src_stride);
5492 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5493 src1_ptr_tmp += (4 * src2_stride);
/* Each row is shuffled against itself with mask0/mask1. */
5496 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5497 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5498 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5499 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
/* Presumably shifts each vector right by 6 — SRA_4V is defined outside this
 * excerpt. */
5520 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5521 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5522 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5523 dst3_r, dst0, dst1, dst2, dst3);
/* Signed dot product of halfword pairs with the packed weights, accumulated
 * onto the combined offset. */
5528 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5529 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5530 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5531 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5532 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5533 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5534 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5535 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5538 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5539 tmp0, tmp1, tmp2, tmp3);
5542 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5543 dst_tmp += (4 * dst_stride);
/* Fragment of a height-based dispatcher taking filter_x and filter_y (the
 * rest of the signature, the first condition and the callee names are not
 * part of this excerpt).  Every visible call forwards the filters, weights,
 * offsets and rnd_val. */
5564 const int8_t *filter_x,
5565 const int8_t *filter_y,
5575 dst, dst_stride, filter_x, filter_y,
5576 weight0, weight1, offset0, offset1, rnd_val);
5577 }
/* Exactly four rows: the callee receives a trailing literal 1. */
else if (4 ==
height) {
5579 src2_stride, dst, dst_stride, filter_x,
5580 filter_y, weight0, weight1, offset0,
5581 offset1, rnd_val, 1);
5582 }
/* Exactly six rows. */
else if (6 ==
height) {
5584 dst, dst_stride, filter_x, filter_y,
5585 weight0, weight1, offset0, offset1, rnd_val);
5586 }
/* Any height that is a multiple of four: the callee receives a trailing
 * literal 8. */
else if (0 == (
height % 4)) {
5588 src1_ptr, src2_stride,
5589 dst, dst_stride, filter_x, filter_y,
5591 weight1, offset0, offset1, rnd_val, 8);
/*
 * NOTE(review): only part of this definition is visible in this chunk (the
 * name, the leading parameters and many body lines are missing), so the
 * comments below describe the visible statements only.
 *
 * Visible structure: shared setup (filters, packed weights, offset and
 * rounding vectors), then two passes with fixed trip counts:
 *   - pass 1 runs 4 iterations of 4 rows each through the *_tmp pointers
 *     and stores with ST_D4;
 *   - pass 2 runs 2 iterations of 8 rows each through src0_ptr/src1_ptr/dst
 *     and stores with ST_W8.
 * Both passes therefore cover 16 rows regardless of any height argument
 * (presumably callers only use this with height 16 -- confirm).
 */
5601 const int8_t *filter_x,
5602 const int8_t *filter_y,
5613 uint8_t *src0_ptr_tmp, *dst_tmp;
5614 int16_t *src1_ptr_tmp;
5616 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5617 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5618 v16i8 mask0, mask1, mask2, mask3;
5619 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5620 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5621 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5622 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5623 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5624 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5625 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5626 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5627 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5628 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5629 v4i32 offset_vec, rnd_vec, const_vec;
/* Step the first source back by one row and one element so reads start
 * above and to the left of the block origin. */
5631 src0_ptr -= (src_stride + 1);
/* filter_vec is loaded twice, first from filter_x and then from filter_y;
 * the statements that consume each load are not visible in this chunk. */
5633 filter_vec =
LD_SH(filter_x);
5636 filter_vec =
LD_SH(filter_y);
/* Combined offset of both predictions, scaled up by rnd_val bits.
 * NOTE(review): if the sum can be negative this left shift of a signed
 * value is undefined behaviour in ISO C -- confirm the operand types. */
5644 offset = (offset0 + offset1) << rnd_val;
/* Pack both weights into one 32-bit word: weight0 in the low 16 bits,
 * weight1 in the high 16 bits.
 * NOTE(review): weight1 << 16 has the same signed-shift caveat as above. */
5645 weight0 = weight0 & 0x0000FFFF;
5646 weight = weight0 | (weight1 << 16);
/* 128 * weight1 is folded into the offset vector below (presumably to
 * compensate for a bias applied to the source samples -- confirm). */
5648 const_vec = __msa_fill_w((128 * weight1));
5650 offset_vec = __msa_fill_w(
offset);
/* The rounding shift amount is rnd_val + 1. */
5651 rnd_vec = __msa_fill_w(rnd_val + 1);
5652 offset_vec += const_vec;
5653 weight_vec = (v8i16) __msa_fill_w(
weight);
/* Pass 1 works on private copies of the source pointers so that
 * src0_ptr / src1_ptr are still at their starting values for pass 2. */
5655 src0_ptr_tmp = src0_ptr;
5657 src1_ptr_tmp = src1_ptr;
5660 src0_ptr_tmp += (3 * src_stride);
5666 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Pass 1: four iterations, four rows per iteration. */
5675 for (loop_cnt = 4; loop_cnt--;) {
5676 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5677 src0_ptr_tmp += (4 * src_stride);
5680 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5681 src1_ptr_tmp += (4 * src2_stride);
/* Each row is shuffled against itself with mask0/mask1. */
5683 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5684 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5685 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5686 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
/* Scale the 32-bit intermediate results by 6 bits and pack them back to
 * 16-bit lanes. */
5707 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5708 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5709 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5710 dst3_r, dst0, dst1, dst2, dst3);
/* Weighted sum: dot product of the halfword pairs in tmpN with the packed
 * (weight0, weight1) pairs, accumulated onto offset_vec.  The statements
 * that fill tmp0..tmp7 are not visible here. */
5715 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5716 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5717 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5718 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5719 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5720 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5721 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5722 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5725 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5726 tmp0, tmp1, tmp2, tmp3);
/* Store four output rows and advance the destination by four rows. */
5729 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5730 dst_tmp += (4 * dst_stride);
/* Pass 2 setup: from here on the original pointers are advanced directly. */
5747 src0_ptr += (3 * src_stride);
5756 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* Pass 2: two iterations, eight rows per iteration. */
5758 for (loop_cnt = 2; loop_cnt--;) {
5759 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5761 src0_ptr += (8 * src_stride);
/* Rows n and n + 4 are shuffled together with mask2/mask3, so one vector
 * carries data from two rows. */
5763 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5764 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5765 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5766 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5773 dst32_r = __msa_ilvr_h(dst73, dst22);
5777 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5778 dst76_r = __msa_ilvr_h(dst22, dst106);
/* The second source is read two rows at a time, four times (eight rows per
 * iteration); the statements that move tp0/tp1 into vectors are not
 * visible here. */
5780 LD2(src1_ptr, src2_stride, tp0, tp1);
5781 src1_ptr += 2 * src2_stride;
5783 LD2(src1_ptr, src2_stride, tp0, tp1);
5784 src1_ptr += 2 * src2_stride;
5787 LD2(src1_ptr, src2_stride, tp0, tp1);
5788 src1_ptr += 2 * src2_stride;
5790 LD2(src1_ptr, src2_stride, tp0, tp1);
5791 src1_ptr += 2 * src2_stride;
/* Same scale / pack / weighted-sum sequence as in pass 1. */
5803 SRA_4V(dst0, dst1, dst2, dst3, 6);
5804 SRA_4V(dst4, dst5, dst6, dst7, 6);
5805 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5806 dst0, dst1, dst2, dst3);
5811 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5812 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5813 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5814 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5815 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5816 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5817 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5818 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5821 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5822 tmp0, tmp1, tmp2, tmp3);
/* Store eight output rows and advance the destination by eight rows. */
5825 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5826 dst += (8 * dst_stride);
/* Carry the upper half of dst106 into the next iteration as dst22. */
5830 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/*
 * NOTE(review): only part of this definition is visible in this chunk; its
 * name, leading parameters and the condition choosing between the two calls
 * are outside the visible lines.
 *
 * Two calls are visible.  The first passes no height argument and ends with
 * the literal 2; the second forwards height and ends with the literal 16.
 * Both forward filter_x, filter_y, the weights, the offsets and rnd_val
 * unchanged.  The meaning of the trailing literals depends on the callees,
 * which are not visible here (presumably a width multiplier and a width).
 */
5840 const int8_t *filter_x,
5841 const int8_t *filter_y,
5851 src2_stride, dst, dst_stride, filter_x,
5852 filter_y, weight0, weight1, offset0,
5853 offset1, rnd_val, 2);
5856 src2_stride, dst, dst_stride,
5857 filter_x, filter_y,
height, weight0,
5858 weight1, offset0, offset1, rnd_val, 16);
/*
 * NOTE(review): only part of this definition is visible in this chunk; its
 * name and leading parameters are outside the visible lines.
 *
 * The single visible call forwards src1_ptr, src2_stride, filter_x,
 * filter_y, height, the weights, the offsets and rnd_val unchanged, and
 * appends the literal 24 (presumably the block width -- confirm against
 * the callee, whose name is not visible here).
 */
5868 const int8_t *filter_x,
5869 const int8_t *filter_y,
5878 src1_ptr, src2_stride,
5880 filter_x, filter_y,
height, weight0,
5881 weight1, offset0, offset1, rnd_val, 24);
/*
 * NOTE(review): only part of this definition is visible in this chunk; its
 * name and leading parameters are outside the visible lines.
 *
 * The single visible call forwards src1_ptr, src2_stride, filter_x,
 * filter_y, height, the weights, the offsets and rnd_val unchanged, and
 * appends the literal 32 (presumably the block width -- confirm against
 * the callee, whose name is not visible here).
 */
5890 const int8_t *filter_x,
5891 const int8_t *filter_y,
5900 src1_ptr, src2_stride,
5902 filter_x, filter_y,
height, weight0,
5903 weight1, offset0, offset1, rnd_val, 32);
/*
 * BI_W_MC_COPY(WIDTH): defines the entry point
 * ff_hevc_put_hevc_bi_w_pel_pixels<WIDTH>_8_msa.
 *
 * The generated function computes shift = 14 + 1 - 8 and
 * log2Wd = denom + shift - 1, then calls hevc_biwgt_copy_<WIDTH>w_msa with
 * src/src_stride, src_16bit with MAX_PB_SIZE passed right after it
 * (presumably as the stride of the 16-bit buffer), dst/dst_stride, height,
 * both weights and the offsets.
 *
 * NOTE(review): part of the parameter list and the end of the call are not
 * visible in this chunk, so how log2Wd is passed on is not shown here.
 */
5906 #define BI_W_MC_COPY(WIDTH) \
5907 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5908 ptrdiff_t dst_stride, \
5910 ptrdiff_t src_stride, \
5911 int16_t *src_16bit, \
5922 int shift = 14 + 1 - 8; \
5923 int log2Wd = denom + shift - 1; \
5925 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5926 dst, dst_stride, height, \
5927 weight0, weight1, offset0, \
/*
 * BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): defines the entry point
 * ff_hevc_put_hevc_bi_w_<PEL>_<DIR><WIDTH>_8_msa.
 *
 * The generated function selects its filter as
 * ff_hevc_<PEL>_filters[FILT_DIR - 1], where FILT_DIR is the name of the
 * variable holding the fractional position (the visible instantiations pass
 * "my").  It computes log2Wd = denom + 14 - 8 and calls
 * hevc_<DIR1>_biwgt_<TAP>t_<WIDTH>w_msa with src/src_stride, src_16bit with
 * MAX_PB_SIZE passed right after it, dst/dst_stride, the filter, height,
 * both weights and both offsets.
 *
 * NOTE(review): part of the parameter list and the end of the call are not
 * visible in this chunk, so how log2Wd is passed on is not shown here.
 */
5943 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5944 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5950 int16_t *src_16bit, \
5961 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5962 int log2Wd = denom + 14 - 8; \
5964 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5965 MAX_PB_SIZE, dst, dst_stride, \
5966 filter, height, weight0, \
5967 weight1, offset0, offset1, \
/* qpel vertical entry points: 8-tap "vt" kernels, filter indexed by my,
 * for widths 4, 8, 12, 16, 24, 32, 48 and 64. */
5980 BI_W_MC(qpel, v, 4, 8, vt, my);
5981 BI_W_MC(qpel, v, 8, 8, vt, my);
5982 BI_W_MC(qpel, v, 12, 8, vt, my);
5983 BI_W_MC(qpel, v, 16, 8, vt, my);
5984 BI_W_MC(qpel, v, 24, 8, vt, my);
5985 BI_W_MC(qpel, v, 32, 8, vt, my);
5986 BI_W_MC(qpel, v, 48, 8, vt, my);
5987 BI_W_MC(qpel, v, 64, 8, vt, my);
/* epel vertical entry points: 4-tap "vt" kernels, filter indexed by my,
 * for widths 4, 8, 6, 12, 16, 24 and 32. */
5997 BI_W_MC(epel, v, 4, 4, vt, my);
5998 BI_W_MC(epel, v, 8, 4, vt, my);
5999 BI_W_MC(epel, v, 6, 4, vt, my);
6000 BI_W_MC(epel, v, 12, 4, vt, my);
6001 BI_W_MC(epel, v, 16, 4, vt, my);
6002 BI_W_MC(epel, v, 24, 4, vt, my);
6003 BI_W_MC(epel, v, 32, 4, vt, my);
6007 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6008 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6009 ptrdiff_t dst_stride, \
6011 ptrdiff_t src_stride, \
6012 int16_t *src_16bit, \
6023 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6024 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6025 int log2Wd = denom + 14 - 8; \
6027 hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6028 MAX_PB_SIZE, dst, dst_stride, \
6029 filter_x, filter_y, height, \
6030 weight0, weight1, offset0, \