    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,        /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20  /* 4 width cases */
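/* The rows above are VSHF_B byte-shuffle control masks: each (n, n + 1)
 * pair selects two horizontally adjacent source bytes, producing the
 * interleaved operand layout the signed-byte dot products (DPADD_SB*)
 * below expect.  Indices 16 and up select from the second source
 * vector, which is how the 4-width cases pack two rows per register. */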
                             int16_t *dst, int32_t dst_stride,
        in0 = (v8i16) __msa_ilvr_b(zero, src0);
        ST_D2(in0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
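/* Scalar model of the hevc_copy_*w_msa family above (illustrative
 * sketch, not part of the original file): HEVC keeps 8-bit inter
 * prediction intermediates at 16-bit precision, so each pixel is just
 * widened and scaled by 64, which is what the ilvr_b-with-zero plus
 * "<<= 6" / SLLI_4V pairs implement. */
static inline void hevc_copy_scalar_ref(const uint8_t *src, int32_t src_stride,
                                        int16_t *dst, int32_t dst_stride,
                                        int32_t width, int32_t height)
{
    int32_t x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (int16_t) (src[x] << 6);
        src += src_stride;
        dst += dst_stride;
    }
}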
                             int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);
        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
                             int16_t *dst, int32_t dst_stride,
        ST_SH2(in0, in1, dst, dst_stride);
    } else if (4 == height) {
        v8i16 in0, in1, in2, in3;
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
                              int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
                              int16_t *dst, int32_t dst_stride,
    if (4 == height) {
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;
        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            src += (8 * src_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
                              int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
        dst += (4 * dst_stride);
    }
}
                              int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;
    }
}
                              int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src8, src9, src10, src11;
    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;
        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;
    }
}
                              int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;
    }
}
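/* Horizontal 8-tap (qpel) kernels follow.  They all share one trick:
 * source bytes are XORed with 128 (XORI_B*_128) so that unsigned
 * pixels can be fed to the signed-byte dot products, and each
 * accumulator starts from const_vec = 128 << 6 to add the bias back
 * (the HEVC 8-tap filters sum to 64, so the offset is exactly
 * 128 * 64). */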
                            int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
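/* Scalar model of the horizontal kernels above (illustrative sketch,
 * not part of the original file).  The result is stored without
 * rounding or clipping; later weighting stages consume it. */
static inline void hevc_hz_8t_scalar_ref(const uint8_t *src, int32_t src_stride,
                                         int16_t *dst, int32_t dst_stride,
                                         const int8_t *filter,
                                         int32_t width, int32_t height)
{
    int32_t x, y, k, sum;

    src -= 3;  /* same pointer rewind the vector code applies on entry */
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            sum = 0;
            for (k = 0; k < 8; k++)
                sum += filter[k] * src[x + k];
            dst[x] = (int16_t) sum;
        }
        src += src_stride;
        dst += dst_stride;
    }
}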
                            int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
                             int16_t *dst, int32_t dst_stride,
    int64_t res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        res0 = __msa_copy_s_d((v2i64) dst4, 0);
        res1 = __msa_copy_s_d((v2i64) dst4, 1);
        res2 = __msa_copy_s_d((v2i64) dst5, 0);
        res3 = __msa_copy_s_d((v2i64) dst5, 1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
    }
}
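/* The 12-width case computes one full 8-column vector result per row
 * plus a 4-column remainder: dst4/dst5 pack the rightmost four columns
 * of four rows, one row per 64-bit half, extracted with
 * __msa_copy_s_d and stored by SD4 at dst + 8. */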
                             int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        ST_SH2(dst0, dst2, dst, dst_stride);
        ST_SH2(dst1, dst3, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    }
}
                             int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
    }
}
                             int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
}
                             int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
        ST_SH2(dst4, dst5, (dst + 32), 8);
        dst += dst_stride;
    }
}
                             int16_t *dst, int32_t dst_stride,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_SH(dst0, dst);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst6 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
        dst += dst_stride;
    }
}
                            int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
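/* Vertical kernels keep a sliding window of interleaved row pairs:
 * after each batch the oldest interleaves are overwritten by the
 * newest (src4332 = src12111110, etc.), so only the new rows have to
 * be loaded and interleaved on the next iteration. */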
                            int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
                             int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = width >> 4; cnt--;) {
        LD_SB7(src_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst2_r = const_vec;
            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                         filt0, filt1, filt2, filt3,
                         dst2_r, dst2_r, dst2_r, dst2_r);
            dst3_r = const_vec;
            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                         filt0, filt1, filt2, filt3,
                         dst3_r, dst3_r, dst3_r, dst3_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
            dst2_l = const_vec;
            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
                         filt0, filt1, filt2, filt3,
                         dst2_l, dst2_l, dst2_l, dst2_l);
            dst3_l = const_vec;
            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
                         filt0, filt1, filt2, filt3,
                         dst3_l, dst3_l, dst3_l, dst3_l);

            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
            dst_tmp += (4 * dst_stride);
        }
    }
}
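/* This is the width-tiled vertical driver: the outer cnt loop walks
 * 16-column tiles, the inner loop emits 4 rows per pass.  The
 * 16/24/32/48/64-width entry points below are expected to forward to
 * it with the matching width argument. */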
                              int16_t *dst, int32_t dst_stride,
                              int16_t *dst, int32_t dst_stride,
                              int16_t *dst, int32_t dst_stride,
                              int16_t *dst, int32_t dst_stride,
                              int16_t *dst, int32_t dst_stride,
                            int16_t *dst, int32_t dst_stride,
                            const int8_t *filter_x, const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = const_vec;
        dst108 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst97, dst97, dst97, dst97);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst108, dst108, dst108, dst108);

        dst76_r = __msa_ilvr_h(dst97, dst66);
        dst87_r = __msa_ilvr_h(dst108, dst97);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
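/* Scalar model of the two-stage HV path (illustrative sketch, not
 * part of the original file): an 8-tap horizontal pass into a 16-bit
 * scratch buffer of height + 7 rows, then an 8-tap vertical pass over
 * that buffer with a >> 6 to keep the result within 16 bits. */
static inline void hevc_hv_8t_scalar_ref(const uint8_t *src, int32_t src_stride,
                                         int16_t *dst, int32_t dst_stride,
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,
                                         int32_t width, int32_t height)
{
    int16_t tmp[(64 + 7) * 64];  /* enough for the largest (64x64) block */
    int16_t *t = tmp;
    int32_t x, y, k, sum;

    src -= (3 * src_stride + 3);  /* same rewind as the vector code */
    for (y = 0; y < height + 7; y++) {
        for (x = 0; x < width; x++) {
            sum = 0;
            for (k = 0; k < 8; k++)
                sum += filter_x[k] * src[x + k];
            t[x] = (int16_t) sum;
        }
        src += src_stride;
        t += 64;
    }
    t = tmp;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            sum = 0;
            for (k = 0; k < 8; k++)
                sum += filter_y[k] * t[x + k * 64];
            dst[x] = (int16_t) (sum >> 6);
        }
        t += 64;
        dst += dst_stride;
    }
}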
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    for (cnt = width >> 3; cnt--;) {
        LD_SB7(src_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        dst3 = const_vec;
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        dst5 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        dst6 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src_tmp += src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;
        }
    }
}
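/* The 8multx1mult driver emits one output row per inner iteration:
 * seven horizontally filtered rows stay live in dst0..dst6, each new
 * input row lands in dst7, and after the vertical dot products the
 * eight registers are rotated down by one for the next row. */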
                            int16_t *dst, int32_t dst_stride,
                            const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);
                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    LD_SB7(src_tmp, src_stride,
           src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
                 dst0, dst0);
    dst1 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
                 dst1, dst1);
    dst2 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
                 dst2, dst2, dst2);
    dst3 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
                 dst3, dst3, dst3);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
                 dst4, dst4);
    dst5 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
                 dst5, dst5);
    dst6 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
                 dst6, dst6, dst6);

    for (loop_cnt = height; loop_cnt--;) {
        src7 = LD_SB(src_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src_tmp += src_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
                     dst7, dst7, dst7);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        ST_SW(dst0_r, dst_tmp);
        dst_tmp += dst_stride;
    }

    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);
    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
                 dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
                 dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
                 dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
                 dst63, dst63, dst63);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = const_vec;
        dst108 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
                     dst97, dst97, dst97);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
                     dst108, dst108, dst108);

        dst76_r = __msa_ilvr_h(dst97, dst66);
        dst87_r = __msa_ilvr_h(dst108, dst97);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);

                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);

                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);

                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 48);

                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 64);
}
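/* From here on: 4-tap (epel) variants.  They mirror the 8-tap layout
 * with one tap pair per dot product (filt0/filt1, DPADD_SB2_SH) and
 * shuffle masks of the same form as ff_hevc_mask_arr. */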
    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    const_vec = __msa_ldi_h(128);
    ST_D2(dst0, 0, 1, dst, dst_stride);

    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    const_vec = __msa_ldi_h(128);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    }

    } else if (4 == height) {
    } else if (0 == height % 8) {
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = 2; loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
}
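/* 6-wide rows are 12 bytes of int16 output, so each row is written as
 * one 64-bit store (SD, columns 0..3) plus one 32-bit store (SW,
 * columns 4..5), extracted with __msa_copy_u_d / __msa_copy_u_w. */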
    v8i16 filt0, filt1, dst0, dst1;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
    v16i8 src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
    int16_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask00, mask11;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    mask11 = mask0 + 10;
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    ST_D2(dst10, 0, 1, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
               src76_r, src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
                             int16_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
               src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
    dst += (8 * dst_stride);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
               src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
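/* For 4-wide vertical filtering, two rows share one register: the
 * ILVR_D* steps pack pairs of right-interleaved rows (e.g. src4332
 * holds the 3|2 and 4|3 interleaves), so each DPADD_SB2_SH produces
 * two output rows at once, stored 64 bits at a time by ST_D8. */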
    } else if (4 == height) {
    } else if (8 == height) {
    } else if (16 == height) {
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (2 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        src += (2 * src_stride);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    src += (2 * src_stride);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    src += (2 * src_stride);
    dst0_r = const_vec;
    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        dst0_r = const_vec;
        dst1_r = const_vec;
        dst2_r = const_vec;
        dst3_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }

    } else if (6 == height) {
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src65_r, src6554;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    src -= (1 * src_stride);
    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        src += (2 * src_stride);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (2 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;

        src += (2 * src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;
    }
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;
    }
3137 v16i8
src0,
src1, src2, src3, src4, src5;
3138 v16i8 src6, src7, src8, src9, src10, src11;
3139 v16i8 src10_r, src32_r, src76_r, src98_r;
3140 v16i8 src21_r, src43_r, src87_r, src109_r;
3141 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3142 v16i8 src10_l, src32_l, src76_l, src98_l;
3143 v16i8 src21_l, src43_l, src87_l, src109_l;
3144 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3146 v8i16 filter_vec, const_vec;
3149 const_vec = __msa_ldi_h(128);
3160 LD_SB3(
src + 16, src_stride, src6, src7, src8);
3161 src += (3 * src_stride);
3163 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3164 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3166 for (loop_cnt = (
height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
        LD_SB2(src, src_stride, src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
    }
}
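/* The 2-D (hv) 4-tap kernels follow.  Common shape: load the horizontal
 * taps with LD_SH(filter_x) into filt0/filt1 and the vertical taps with
 * LD_SH(filter_y) into filt_h0/filt_h1, run the horizontal pass with
 * VSHF_B2_SB + DPADD_SB2_SH into 16-bit row results, interleave
 * consecutive rows and apply HEVC_FILT_4TAP vertically, then shift the
 * 32-bit sums right by 6 and pack.  First variant: a single 4x2 block
 * (hevc_hv_4t_4x2_msa). */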
                              const int8_t *filter_x,
                              const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
}
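/* Same scheme for a 4x4 block (hevc_hv_4t_4x4_msa): seven input rows feed
 * four output rows, and the four 32-bit row results are packed back to 16
 * bits before the ST_D4 store. */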
                              const int8_t *filter_x,
                              const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, const_vec;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);
    ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
}
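/* 4-wide, height a multiple of 8 (hevc_hv_4t_4multx8mult_msa): three
 * priming rows seed the row history, then each iteration loads eight rows,
 * shuffles them in src3/src7-style pairs and stores eight packed rows with
 * ST_D8. */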
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
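/* Width-4 dispatcher: fixed kernels for heights 2 and 4 (the 4x2 and 4x4
 * bodies above), the multiple-of-8 loop for everything else. */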
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height);
    }
}
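/* 6-wide 2-D filter (hevc_hv_4t_6w_msa): eight rows are produced in one
 * pass; the left 4 columns go out via ST_D4 and the remaining 2 columns
 * via ST_W4 word stores at dst + 4. */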
                             const int8_t *filter_x,
                             const int8_t *filter_y,
                             int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}
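/* 8-wide kernels: each row now needs both vector halves, so the vertical
 * pass runs on _r/_l interleave pairs and full ST_SW vectors are stored.
 * First the 8x2 case (hevc_hv_4t_8x2_msa). */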
                              const int8_t *filter_x,
                              const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}
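/* 8-wide x 4 rows, repeated width8mult times across the block
 * (hevc_hv_4t_8multx4_msa); reused below with width8mult = 1 for plain
 * 8-wide and width8mult = 2 for the 16-wide height-4 case. */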
                                   int16_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t width8mult)
{
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    for (cnt = width8mult; cnt--;) {
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);

        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        src += 8;   /* advance to the next 8-column stripe */
        dst += 8;
    }
}
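/* 8x6 block (hevc_hv_4t_8x6_msa): nine input rows produce six output
 * rows, stored as three ST_SW2 pairs. */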
                             const int8_t *filter_x,
                             const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}
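/* Generic 8-column stripe loop (hevc_hv_4t_8multx4mult_msa): walks
 * width8mult stripes of 8 columns through src_tmp/dst_tmp, filtering four
 * rows per inner iteration; this one body serves widths 8, 16, 24 and 32. */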
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height, int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        src_tmp += (3 * src_stride);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        src += 8;   /* next 8-column stripe */
        dst += 8;
    }
}
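/* Width-8 dispatcher: fixed kernels for heights 2, 4 and 6, the generic
 * stripe loop (width8mult = 1) for any other multiple of 4. */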
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 1);
    }
}
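/* 12-wide 2-D filter (hevc_hv_4t_12w_msa): an 8-wide stripe handled like
 * the loop above (four passes of four rows), then a 4-wide stripe that
 * reuses the paired-row shuffles of the 4-wide kernel via mask2/mask3. */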
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
    v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);   /* horizontal taps -> filt0, filt1 */
    filter_vec = LD_SH(filter_y);   /* vertical taps -> filt_h0, filt_h1 */

    const_vec = __msa_ldi_h(128);

    src_tmp = src;
    dst_tmp = dst;

    src_tmp += (3 * src_stride);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
    src += (3 * src_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
        PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
                    tmp2, tmp3);
        ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
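/* Thin wrappers for the remaining widths: 16, 24 and 32 map to 2, 3 and 4
 * stripes of 8 columns respectively. */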
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    if (4 == height) {
        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 2);
    } else {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
    }
}

                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 3);
}

                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 4);
}
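/* Public entry points.  MC_COPY(WIDTH) wraps the hevc_copy_<WIDTH>w_msa
 * kernels from the top of the file in the ff_hevc_put_hevc_pel_pixels
 * prototype expected by the HEVC DSP tables. */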
#define MC_COPY(WIDTH)                                                      \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, uint8_t *src, \
                                                ptrdiff_t src_stride,       \
                                                int height, intptr_t mx,    \
                                                intptr_t my, int width)     \
{                                                                           \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);    \
}
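/* MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) emits one 1-D entry point per
 * (filter, direction, width) triple.  For illustration, MC(qpel, h, 4, 8,
 * hz, mx) below expands roughly to
 *
 *     void ff_hevc_put_hevc_qpel_h4_8_msa(int16_t *dst, uint8_t *src,
 *                                         ptrdiff_t src_stride, int height,
 *                                         intptr_t mx, intptr_t my,
 *                                         int width)
 *     {
 *         const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *         hevc_hz_8t_4w_msa(src, src_stride, dst, MAX_PB_SIZE,
 *                           filter, height);
 *     }
 *
 * (parameter list assumed from the standard FFmpeg hevcdsp prototype). */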
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
                                                   uint8_t *src,            \
                                                   ptrdiff_t src_stride,    \
                                                   int height, intptr_t mx, \
                                                   intptr_t my, int width)  \
{                                                                           \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];           \
                                                                            \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,             \
                                          MAX_PB_SIZE, filter, height);     \
}
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);
MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);
MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);
MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);
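/* MC_HV(PEL, WIDTH, TAP) emits the 2-D entry points: mx indexes the
 * horizontal coefficient row and my the vertical one, both from
 * ff_hevc_<PEL>_filters, before calling the hevc_hv_<TAP>t_<WIDTH>w_msa
 * kernels above. */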
#define MC_HV(PEL, WIDTH, TAP)                                            \
void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height, intptr_t mx,  \
                                                intptr_t my, int width)   \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,    \
                                    filter_x, filter_y, height);          \
}