    uint32_t data0, data1;
    v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;

    offset_in <<= (log2_denom);

    offset_in += (1 << (log2_denom - 1));

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom);

    data1 = LW(data + stride);
    src0 = (v16u8) __msa_fill_w(data0);
    src1 = (v16u8) __msa_fill_w(data1);

    ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
    MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
    ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);

    out0 = (v8u16) __msa_srl_h(temp0, denom);
    out1 = (v8u16) __msa_srl_h(temp1, denom);

    data0 = __msa_copy_u_w(res0, 0);
    data1 = __msa_copy_u_w(res1, 0);

    uint32_t data0, data1, data2, data3;
    v8u16 temp0, temp1, temp2, temp3, wgt;

    offset_in <<= (log2_denom);

    offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LW4(data, stride, data0, data1, data2, data3);

        src0 = (v16u8) __msa_fill_w(data0);
        src1 = (v16u8) __msa_fill_w(data1);
        src2 = (v16u8) __msa_fill_w(data2);
        src3 = (v16u8) __msa_fill_w(data3);

        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   temp0, temp1, temp2, temp3);
        MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
             temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);

        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);

    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 temp0, temp1, temp2, temp3;

    offset_in <<= (log2_denom);

    offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = (v8u16) __msa_fill_h(offset_in);
    denom = (v8u16) __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LD_UB4(data, stride, src0, src1, src2, src3);
        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_r, src1_r, src2_r, src3_r);
        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
             temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);

        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);

        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);

    v16u8 dst0, dst1, dst2, dst3;
    v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

    offset_in <<= (log2_denom);

    offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = (v8u16) __msa_fill_h(offset_in);
    denom = (v8u16) __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LD_UB4(data, stride, src0, src1, src2, src3);
        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_r, src1_r, src2_r, src3_r);
        ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_l, src1_l, src2_l, src3_l);
        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
             temp0, temp1, temp2, temp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
             temp4, temp5, temp6, temp7);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
                    temp4, temp5, temp6, temp7);

        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
        SRL_H4_UH(temp4, temp5, temp6, temp7, denom);

        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, data, stride);
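/* Illustrative only: a minimal scalar sketch of the per-pixel operation the
 * avc_wgt_* fragments above vectorize (weight, offset with the rounding bias
 * folded in, shift, clamp). The function name and the guard on log2_denom == 0
 * are illustrative additions, not part of this file; the fragments above may
 * handle that case in lines elided here. */
static void avc_wgt_scalar_sketch(uint8_t *data, int stride,
                                  int width, int height, int log2_denom,
                                  int src_weight, int offset_in)
{
    int x, y;
    /* same precomputation as above: offset scaled up, rounding bias added */
    int offset = offset_in << log2_denom;

    if (log2_denom)
        offset += 1 << (log2_denom - 1);

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int val = (data[x] * src_weight + offset) >> log2_denom;

            data[x] = val < 0 ? 0 : (val > 255 ? 255 : val);
        }
        data += stride;
    }
}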
    uint32_t load0, load1, out0, out1;
    v16i8 src_wgt, dst_wgt, wgt;
    v8i16 temp0, temp1, denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    src0 = (v16i8) __msa_fill_w(load0);
    src1 = (v16i8) __msa_fill_w(load1);

    load1 = LW(dst + dst_stride);

    dst0 = (v16i8) __msa_fill_w(load0);
    dst1 = (v16i8) __msa_fill_w(load1);

    ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);

    temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
    temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);

    PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_w((v4i32) dst1, 0);

    uint32_t load0, load1, load2, load3;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LW4(src, src_stride, load0, load1, load2, load3);
        src += (4 * src_stride);

        src0 = (v16i8) __msa_fill_w(load0);
        src1 = (v16i8) __msa_fill_w(load1);
        src2 = (v16i8) __msa_fill_w(load2);
        src3 = (v16i8) __msa_fill_w(load3);

        LW4(dst, dst_stride, load0, load1, load2, load3);

        dst0 = (v16i8) __msa_fill_w(load0);
        dst1 = (v16i8) __msa_fill_w(load1);
        dst2 = (v16i8) __msa_fill_w(load2);
        dst3 = (v16i8) __msa_fill_w(load3);

        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);

        dst += (4 * dst_stride);

                         src_weight, dst_weight, offset_in);
                              log2_denom, src_weight, dst_weight,

    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);

        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += 4 * dst_stride;

    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec0, vec2, vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec1, vec3, vec5, vec7);

        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        SRA_4V(temp4, temp5, temp6, temp7, denom);

        PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += 4 * dst_stride;
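/* Illustrative only: a scalar sketch of what the avc_biwgt_* fragments above
 * compute per pixel, mirroring the precomputed offset ((offset_in + 1) | 1)
 * << log2_denom and the final shift by log2_denom + 1 used there. The
 * 128 * (src_weight + dst_weight) term above presumably compensates for a
 * signed-byte trick in lines elided from the fragment and has no counterpart
 * here. The function name is illustrative, not part of this file. */
static void avc_biwgt_scalar_sketch(uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    int width, int height, int log2_denom,
                                    int src_weight, int dst_weight,
                                    int offset_in)
{
    int x, y;
    int offset = ((offset_in + 1) | 1) << log2_denom;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int val = (src[x] * src_weight + dst[x] * dst_weight + offset)
                      >> (log2_denom + 1);

            dst[x] = val < 0 ? 0 : (val > 255 ? 255 : val);
        }
        src += src_stride;
        dst += dst_stride;
    }
}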
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
    v8i16 const3 = __msa_ldi_h(3);                                          \
                                                                            \
    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
    threshold += (p1_or_q1_org_in);                                         \
                                                                            \
    (p0_or_q0_out) = threshold << 1;                                        \
    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
                                                                            \
    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
                                                                            \
    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += threshold;                                            \
    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
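/* For reference: with the arguments the callers below pass in, the macro above
 * is the strong (bS == 4) luma filter for one side of the edge and reduces to
 * the standard scalar expressions
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * (mirrored for the q side), with "threshold" holding the shared
 * p1 + p0 + q0 sum. */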
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,     \
                         p1_or_q1_org_in, p0_or_q0_out)        \
    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);    \
    (p0_or_q0_out) += (p1_or_q1_org_in);                       \
    (p0_or_q0_out) += (p1_or_q1_org_in);                       \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);         \
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,          \
                         p1_or_q1_org_in, p2_or_q2_org_in,          \
                         negate_tc_in, tc_in, p1_or_q1_out)         \
    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,         \
                                   (v8u16) q0_or_p0_org_in);        \
    temp = p1_or_q1_org_in << 1;                                    \
    clip3 = clip3 - temp;                                           \
    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);                  \
    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);                    \
    p1_or_q1_out = p1_or_q1_org_in + clip3;                         \
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     negate_threshold_in, threshold_in,             \
                     p0_or_q0_out, q0_or_p0_out)                    \
    v8i16 q0_sub_p0, p1_sub_q1, delta;                              \
                                                                    \
    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;                  \
    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;                  \
                                                                    \
    delta = q0_sub_p0 + p1_sub_q1;                                  \
                                                                    \
    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);      \
                                                                    \
    p0_or_q0_out = p0_or_q0_org_in + delta;                         \
    q0_or_p0_out = q0_or_p0_org_in - delta;                         \
                                                                    \
    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                     \
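/* Scalar form of the delta/clip step implemented by AVC_LPF_P0Q0 above,
 * assuming the standard H.264 normal (bS < 4) filter; the *4 factor on
 * q0 - p0 and the rounding shift sit in lines elided from this fragment:
 *     delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
 *     p0'   = clip1(p0 + delta)
 *     q0'   = clip1(q0 - delta)
 * AVC_LPF_P1_OR_Q1 above supplies the matching p1/q1 update when the
 * corresponding threshold check passes. */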
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)       \
    uint32_t load0, load1, load2, load3;                                  \
    v16u8 src0 = { 0 };                                                   \
    v16u8 src1 = { 0 };                                                   \
    v16u8 src2 = { 0 };                                                   \
    v16u8 src3 = { 0 };                                                   \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                             \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;            \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                                \
    v8i16 res0_r, res1_r;                                                 \
    v16i8 zeros = { 0 };                                                  \
                                                                          \
    LW4((src - 2), stride, load0, load1, load2, load3);                   \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                \
    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);                \
    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);                \
                                                                          \
    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);   \
                                                                          \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                              \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                              \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                              \
                                                                          \
    tc = __msa_fill_h(tc_val);                                            \
                                                                          \
    is_less_than_alpha = (p0_asub_q0 < alpha);                            \
    is_less_than_beta = (p1_asub_p0 < beta);                              \
    is_less_than = is_less_than_alpha & is_less_than_beta;                \
    is_less_than_beta = (q1_asub_q0 < beta);                              \
    is_less_than = is_less_than_beta & is_less_than;                      \
                                                                          \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);             \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);              \
                                                                          \
    delta = q0_sub_p0 + p1_sub_q1;                                        \
    delta = __msa_srari_h(delta, 3);                                      \
                                                                          \
    delta = CLIP_SH(delta, -tc, tc);                                      \
                                                                          \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                 \
                                                                          \
    CLIP_SH2_0_255(res0_r, res1_r);                                       \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);              \
                                                                          \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                        \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                        \
                                                                          \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);               \
#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)    \
    v16i8 zero_m = { 0 };                                      \
                                                               \
    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);     \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);      \
    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                   \
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
    uint32_t load0, load1;                                                 \
    v16u8 src0 = { 0 };                                                    \
    v16u8 src1 = { 0 };                                                    \
    v16u8 src2 = { 0 };                                                    \
    v16u8 src3 = { 0 };                                                    \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
    v16i8 zeros = { 0 };                                                   \
                                                                           \
    load0 = LW(src - 2);                                                   \
    load1 = LW(src - 2 + stride);                                          \
                                                                           \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
                                                                           \
    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
                                                                           \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
                                                                           \
    tc = __msa_fill_h(tc_val);                                             \
                                                                           \
    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
    is_less_than_beta = (p1_asub_p0 < beta);                               \
    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
    is_less_than_beta = (q1_asub_q0 < beta);                               \
    is_less_than = is_less_than_beta & is_less_than;                       \
                                                                           \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
                                                                           \
    delta = q0_sub_p0 + p1_sub_q1;                                         \
    delta = __msa_srari_h(delta, 3);                                       \
    delta = CLIP_SH(delta, -tc, tc);                                       \
                                                                           \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
                                                                           \
    CLIP_SH2_0_255(res0_r, res1_r);                                        \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
                                                                           \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
                                                                           \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
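/* The two *_CHROMA_422 macros above apply the chroma variant of that normal
 * filter to vertical edges: four (or two) rows around the edge are gathered
 * and transposed so p1..q1 land in byte lanes, the clipped delta is added to
 * p0 and subtracted from q0, and only those two columns are blended back
 * under the alpha/beta mask; p1 and q1 are left untouched. */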
    v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
    v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
    v16u8 p2, p1, p0, q0, q1, q2;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);

    v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        q2_org = LD_UB(data + (2 * img_width));
        p3_org = LD_UB(data - (img_width << 2));
        p2_org = LD_UB(data - (3 * img_width));

        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        v8u16 is_less_than_beta_l, is_less_than_beta_r;

        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);

        is_less_than_beta_r =
            (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

        is_less_than_beta_l =
            (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);

        if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

        if (!__msa_test_bz_v(is_less_than_beta)) {
            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);

            ST_UB(p1_org, data - (2 * img_width));
            ST_UB(p2_org, data - (3 * img_width));

        v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;

        negate_is_less_than_beta_r =
            (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {

        negate_is_less_than_beta_l =
            (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
        if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {

        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        ST_UB(p0_org, data - img_width);

        q3_org = LD_UB(data + (3 * img_width));
        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        v8u16 is_less_than_beta_l, is_less_than_beta_r;
        is_less_than_beta_r =
            (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

        is_less_than_beta_l =
            (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
        if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

        if (!__msa_test_bz_v(is_less_than_beta)) {
            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);

            ST_UB(q1_org, data + img_width);
            ST_UB(q2_org, data + 2 * img_width);

        v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
        negate_is_less_than_beta_r =
            (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {

        negate_is_less_than_beta_l =
            (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
        if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {

        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
    v16u8 alpha, beta, p0_asub_q0;
    v16u8 is_less_than_alpha, is_less_than;
    v16u8 is_less_than_beta, negate_is_less_than_beta;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

    LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src + (8 * img_width), img_width,
           row8, row9, row10, row11, row12, row13, row14, row15);

                        row4, row5, row6, row7,
                        row8, row9, row10, row11,
                        row12, row13, row14, row15,
                        p3_org, p2_org, p1_org, p0_org,
                        q0_org, q1_org, q2_org, q3_org);

    v16u8 p1_asub_p0, q1_asub_q0;

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta);

        is_less_than_beta = tmp_flag & is_less_than_beta;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        v16u8 is_less_than_beta_r;

        is_less_than_beta_r =
            (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v(is_less_than_beta_r)) {
            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

        v16u8 is_less_than_beta_l;

        is_less_than_beta_l =
            (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
        if (!__msa_test_bz_v(is_less_than_beta_l)) {
            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

        if (!__msa_test_bz_v(is_less_than_beta)) {
            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);

        v16u8 negate_is_less_than_beta_r;

        negate_is_less_than_beta_r =
            (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);

        if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {

        v16u8 negate_is_less_than_beta_l;

        negate_is_less_than_beta_l =
            (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
        if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {

        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta);

        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);

        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        v16u8 is_less_than_beta_r;

        is_less_than_beta_r =
            (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v(is_less_than_beta_r)) {
            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

        v16u8 is_less_than_beta_l;

        is_less_than_beta_l =
            (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
        if (!__msa_test_bz_v(is_less_than_beta_l)) {
            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

        if (!__msa_test_bz_v(is_less_than_beta)) {
            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);

        v16u8 negate_is_less_than_beta_r;

        negate_is_less_than_beta_r =
            (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
        if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {

        v16u8 negate_is_less_than_beta_l;

        negate_is_less_than_beta_l =
            (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
        if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {

        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);

        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

        ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
        ST2x4_UB(tmp2, 0, src + 4, img_width);
        src += 4 * img_width;
        ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
        ST2x4_UB(tmp2, 4, src + 4, img_width);
        src += 4 * img_width;

        ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
        ST2x4_UB(tmp5, 0, src + 4, img_width);
        src += 4 * img_width;
        ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
        ST2x4_UB(tmp5, 4, src + 4, img_width);
    uint64_t load0, load1;
    uint32_t out0, out2;
    uint16_t out1, out3;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
    v16u8 is_less_than_beta1, is_less_than_beta2;
    v16i8 zeros = { 0 };

    load0 = LD(src - 4);
    load1 = LD(src + stride - 4);
    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);

    load0 = LD(src + (2 * stride) - 4);
    load1 = LD(src + (3 * stride) - 4);
    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);

    load0 = LD(src + (4 * stride) - 4);
    load1 = LD(src + (5 * stride) - 4);
    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);

    load0 = LD(src + (6 * stride) - 4);
    load1 = LD(src + (7 * stride) - 4);
    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
               src0, src1, src2, src3);

    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);

    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);

    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than & is_less_than_beta;

    is_less_than_alpha = (p0_asub_q0 < alpha);

    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
    is_less_than_beta1 = (p2_asub_p0 < beta);
    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
    is_less_than_beta2 = (q2_asub_q0 < beta);

    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
               src0_r, src1_r, src2_r, src3_r);
    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
               src4_r, src5_r, src6_r, src7_r);

    dst2_x_r = src1_r + src2_r + src3_r;
    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
    dst1_r = src0_r + src1_r + src2_r + src3_r;
    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);

    dst0_r = (2 * src6_r) + (3 * src0_r);
    dst0_r += src1_r + src2_r + src3_r;
    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);

    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);

    dst3_x_r = src2_r + src3_r + src4_r;
    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
    dst4_r = src2_r + src3_r + src4_r + src5_r;
    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);

    dst5_r = (2 * src7_r) + (3 * src5_r);
    dst5_r += src4_r + src3_r + src2_r;
    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);

    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);

    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);

    is_less_than = is_less_than_alpha & is_less_than;
    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);

    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);

    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);

    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);

    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 2);
    out2 = __msa_copy_u_w((v4i32) dst1, 0);
    out3 = __msa_copy_u_h((v8i16) dst1, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));

    SW(out2, (src - 3));
    SH(out3, (src + 1));

    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));

    SW(out2, (src - 3));
    SH(out3, (src + 1));

    out0 = __msa_copy_u_w((v4i32) dst4, 0);
    out1 = __msa_copy_u_h((v8i16) dst4, 2);
    out2 = __msa_copy_u_w((v4i32) dst5, 0);
    out3 = __msa_copy_u_h((v8i16) dst5, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));

    SW(out2, (src - 3));
    SH(out3, (src + 1));

    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));

    SW(out2, (src - 3));
    SH(out3, (src + 1));
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);

        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);

        ST_UB(q0_or_p0_org, data_cb_or_cr);
        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
    v16u8 alpha, beta, is_less_than;
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8((data_cb_or_cr - 2), img_width,
           row0, row1, row2, row3, row4, row5, row6, row7);

                        p1_or_q1_org, p0_or_q0_org,
                        q0_or_p0_org, q1_or_p1_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);

        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);

        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
        data_cb_or_cr += 4 * img_width;
        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
    v16u8 beta, tmp_vec, bs = { 0 };
    v16u8 is_less_than, is_less_than_beta;
    v16u8 p1, p0, q0, q1;
    v8i16 p0_r, q0_r, p1_r = { 0 };
    v8i16 p0_l, q0_l, p1_l = { 0 };
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
    v16u8 is_bs_greater_than0;

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        is_bs_greater_than0 = (zero < bs);

        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

               row0, row1, row2, row3, row4, row5, row6, row7);
        src += (8 * img_width);
               row8, row9, row10, row11, row12, row13, row14, row15);

                            row8, row9, row10, row11,
                            row12, row13, row14, row15,
                            p3_org, p2_org, p1_org, p0_org,
                            q0_org, q1_org, q2_org, q3_org);

        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
        v16u8 is_less_than_alpha;

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 negate_tc, sign_negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l;

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            v16u8 is_less_than_beta_r, is_less_than_beta_l;

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            is_less_than_beta_r =
                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
            if (!__msa_test_bz_v(is_less_than_beta_r)) {
                p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                                 negate_tc_r, tc_r, p1_r);

            is_less_than_beta_l =
                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
            if (!__msa_test_bz_v(is_less_than_beta_l)) {
                p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
                                 i16_negatetc_l, tc_l, p1_l);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;

            v16u8 is_less_than_beta_l, is_less_than_beta_r;

            u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (u8_q2asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);

            is_less_than_beta_r =
                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
            if (!__msa_test_bz_v(is_less_than_beta_r)) {
                q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                                 negate_tc_r, tc_r, q1_r);

            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

            is_less_than_beta_l =
                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
            if (!__msa_test_bz_v(is_less_than_beta_l)) {
                q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
                                 i16_negatetc_l, tc_l, q1_l);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;

            v8i16 threshold_r, negate_thresh_r;
            v8i16 threshold_l, negate_thresh_l;
            v16i8 negate_thresh, sign_negate_thresh;

            negate_thresh = zero - (v16i8) tc;
            sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

            ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                       threshold_r, negate_thresh_r);
                         negate_thresh_r, threshold_r, p0_r, q0_r);

            threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
            negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                         negate_thresh_l, threshold_l, p0_l, q0_l);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            v16i8 tp0, tp1, tp2, tp3;
            v4i32 tmp3, tmp4, tmp6, tmp7;
            uint32_t out0, out2;
            uint16_t out1, out3;

            out0 = __msa_copy_u_w(tmp3, 0);
            out1 = __msa_copy_u_h(tmp2, 0);
            out2 = __msa_copy_u_w(tmp3, 1);
            out3 = __msa_copy_u_h(tmp2, 1);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp3, 2);
            out1 = __msa_copy_u_h(tmp2, 2);
            out2 = __msa_copy_u_w(tmp3, 3);
            out3 = __msa_copy_u_h(tmp2, 3);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp4, 0);
            out1 = __msa_copy_u_h(tmp2, 4);
            out2 = __msa_copy_u_w(tmp4, 1);
            out3 = __msa_copy_u_h(tmp2, 5);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp4, 2);
            out1 = __msa_copy_u_h(tmp2, 6);
            out2 = __msa_copy_u_w(tmp4, 3);
            out3 = __msa_copy_u_h(tmp2, 7);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp6, 0);
            out1 = __msa_copy_u_h(tmp5, 0);
            out2 = __msa_copy_u_w(tmp6, 1);
            out3 = __msa_copy_u_h(tmp5, 1);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp6, 2);
            out1 = __msa_copy_u_h(tmp5, 2);
            out2 = __msa_copy_u_w(tmp6, 3);
            out3 = __msa_copy_u_h(tmp5, 3);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp7, 0);
            out1 = __msa_copy_u_h(tmp5, 4);
            out2 = __msa_copy_u_w(tmp7, 1);
            out3 = __msa_copy_u_h(tmp5, 5);

            SH(out1, (src + 4));
            SH(out3, (src + 4));

            out0 = __msa_copy_u_w(tmp7, 2);
            out1 = __msa_copy_u_h(tmp5, 6);
            out2 = __msa_copy_u_w(tmp7, 3);
            out3 = __msa_copy_u_h(tmp5, 7);

            SH(out1, (src + 4));
            SH(out3, (src + 4));
                                                 uint32_t image_width)
    v16u8 p2_asub_p0, u8_q2asub_q0;
    v16u8 alpha, beta, is_less_than, is_less_than_beta;
    v16u8 p1, p0, q0, q1;
    v8i16 p0_r, q0_r, q1_r = { 0 };
    v8i16 p0_l, q0_l, q1_l = { 0 };
    v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB5(data - (3 * image_width), image_width,
               p2_org, p1_org, p0_org, q0_org, q1_org);

        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
        v16u8 is_less_than_alpha, is_bs_greater_than0;

        is_bs_greater_than0 = ((v16u8) zero < bs);
        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 sign_negate_tc, negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;

            q2_org = LD_UB(data + (2 * image_width));
            negate_tc = zero - tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            v8u16 is_less_than_beta_r, is_less_than_beta_l;

            is_less_than_beta_r =
                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
                p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                                 negate_tc_r, tc_r, p1_r);

            is_less_than_beta_l =
                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
                p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
                                 i16_negatetc_l, tc_l, p1_l);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
                ST_UB(p1_org, data - (2 * image_width));

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;

            u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (u8_q2asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            v8u16 is_less_than_beta_r, is_less_than_beta_l;
            is_less_than_beta_r =
                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
                q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                                 negate_tc_r, tc_r, q1_r);

            is_less_than_beta_l =
                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);

            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
                q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
                                 i16_negatetc_l, tc_l, q1_l);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
                ST_UB(q1_org, data + image_width);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;

            v16i8 negate_thresh, sign_negate_thresh;
            v8i16 threshold_r, threshold_l;
            v8i16 negate_thresh_l, negate_thresh_r;

            negate_thresh = zero - tc;
            sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

            ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                       threshold_r, negate_thresh_r);
                         negate_thresh_r, threshold_r, p0_r, q0_r);

            threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
            negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                         negate_thresh_l, threshold_l, p0_l, q0_l);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(p0_org, (data - image_width));
            ST_UB(q0_org, data);
    uint32_t out0, out1, out2, out3;
    v8i16 src4, src5, src6, src7;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
    v16u8 is_less_than_beta1, is_less_than_beta2;
    v8i16 tc, tc_orig_r, tc_plus1;
    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
    v8u16 src2_r, src3_r;
    v8i16 p2_r, p1_r, q2_r, q1_r;
    v16u8 p2, q2, p0, q0;
    v16i8 zeros = { 0 };

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    load = LD(data - 3);
    inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
    load = LD(data - 3 + stride);
    inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);

    load = LD(data - 3);
    inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
    load = LD(data - 3 + stride);
    inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);

    load = LD(data - 3);
    inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
    load = LD(data - 3 + stride);
    inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);

    load = LD(data - 3);
    inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
    load = LD(data - 3 + stride);
    inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);

    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
               src0, src1, src2, src3);

    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);

    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);

    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    is_less_than_beta1 = (p2_asub_p0 < beta);
    is_less_than_beta2 = (q2_asub_q0 < beta);

    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);

    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);

    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
    is_tc_orig1 = tc_orig;
    is_tc_orig2 = tc_orig;
    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);

    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);

    is_tc_orig1 = (zeros < is_tc_orig1);
    is_tc_orig2 = is_tc_orig1;
    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
    is_tc_orig1 = is_less_than & is_tc_orig1;
    is_tc_orig2 = is_less_than & is_tc_orig2;

    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);

    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);

    p1_sub_q1 = p1_r - q1_r;
    q0_sub_p0 += p1_sub_q1;
    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);

    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
                                              (v16i8) is_less_than_beta1);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1,
                              is_less_than_beta1);

    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
                                              (v16i8) is_less_than_beta2);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1,
                              is_less_than_beta2);

    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);

    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
    src2_r += q0_sub_p0;
    src3_r -= q0_sub_p0;

    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);

    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);

    out0 = __msa_copy_u_w(dst0, 0);
    out1 = __msa_copy_u_w(dst0, 1);
    out2 = __msa_copy_u_w(dst0, 2);
    out3 = __msa_copy_u_w(dst0, 3);

    SW(out0, (data - 2));
    SW(out1, (data - 2));

    SW(out2, (data - 2));
    SW(out3, (data - 2));

    out0 = __msa_copy_u_w(dst1, 0);
    out1 = __msa_copy_u_w(dst1, 1);
    out2 = __msa_copy_u_w(dst1, 2);
    out3 = __msa_copy_u_w(dst1, 3);

    SW(out0, (data - 2));
    SW(out1, (data - 2));

    SW(out2, (data - 2));
    SW(out3, (data - 2));
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 tc_r, negate_tc_r;

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB4(data - (img_width << 1), img_width,
               p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);

            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(q0_org, data);
            ST_UB(p0_org, (data - img_width));
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16u8 is_bs_greater_than0;
    v8i16 tc_r, negate_tc_r;
    v16i8 negate_tc, sign_negate_tc;

    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 tmp1, tmp_vec, bs = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        LD_UB8((data - 2), img_width,
               row0, row1, row2, row3, row4, row5, row6, row7);

                            row4, row5, row6, row7,
                            p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);

            src += 4 * img_width;
2292 v16u8 alpha, beta, res;
2294 alpha = (v16u8) __msa_fill_b(alpha_in);
2295 beta = (v16u8) __msa_fill_b(beta_in);
2297 for (col = 0; col < 4; col++) {
2298 tc_val = (tc0[col] - 1) + 1;
2306 ST2x4_UB(res, 0, (src - 1), stride);
2318 v16u8 alpha, beta, res;
2320 alpha = (v16u8) __msa_fill_b(alpha_in);
2321 beta = (v16u8) __msa_fill_b(beta_in);
2323 for (col = 0; col < 4; col++) {
2324 tc_val = (tc0[col] - 1) + 1;
2333 out0 = __msa_copy_s_h((v8i16) res, 0);
2334 out1 = __msa_copy_s_h((v8i16) res, 1);
2336 SH(out0, (src - 1));
2338 SH(out1, (src - 1));
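/*
 * Editor's sketch (assumption): SH() is the generic "store halfword" helper
 * macro; each 16-bit value copied out above holds the filtered pixel pair
 * straddling the vertical edge, so one 2-byte store at (src - 1) rewrites
 * exactly those two pixels.  A roughly equivalent scalar form (requires
 * <stdint.h> and <string.h>):
 */
static inline void sh_store_ref(uint16_t val, uint8_t *dst)
{
    memcpy(dst, &val, sizeof(val));  /* unaligned-safe 2-byte store */
}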
2344 int alpha, int beta, int8_t *tc)
2361 tc[0], tc[1], tc[2], tc[3],
2362 alpha, beta, img_width);
2366 int alpha, int beta, int8_t *tc)
2384 tc[0], tc[1], tc[2], tc[3],
2385 alpha, beta, img_width);
2389 int alpha, int beta, int8_t *tc)
2406 tc[0], tc[1], tc[2], tc[3],
2407 alpha, beta, img_width);
2411 int alpha, int beta, int8_t *tc)
2428 tc[0], tc[1], tc[2], tc[3],
2429 alpha, beta, img_width);
2433 int alpha, int beta)
2437 (unsigned int) img_width);
2441 int alpha, int beta)
2445 (unsigned int) img_width);
2449 int alpha, int beta)
2453 (unsigned int) img_width);
2457 int alpha, int beta)
2461 (unsigned int) img_width);
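/*
 * Editor's sketch: the four intra loop-filter wrappers above appear only as
 * fragments here.  Judging from the visible casts, each one plausibly just
 * forwards its arguments to the matching static MSA routine, e.g.:
 */
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}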
2499 int height, int log2_denom,
2500 int weight_src, int offset)
2506 int height, int log2_denom,
2507 int weight_src, int offset)
2513 int height, int log2_denom,
2514 int weight_src, int offset)
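/*
 * Editor's sketch: the weighted-prediction wrappers above follow the same
 * forwarding pattern; a plausible expansion of one of them (parameter order
 * inferred from the avc_wgt_*width_msa helpers used earlier in the file):
 */
void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, int height,
                                  int log2_denom, int weight_src, int offset)
{
    avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
}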
2521 int log2_denom, int weight_dst,
2522 int weight_src, int offset)
2525 weight_src, weight_dst, offset);
2530 int log2_denom, int weight_dst,
2531 int weight_src, int offset)
2534 weight_src, weight_dst, offset);
2539 int log2_denom, int weight_dst,
2540 int weight_src, int offset)
2543 weight_src, weight_dst, offset);
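/*
 * Editor's sketch: the bi-weighted wrappers above pass (weight_src,
 * weight_dst, offset) through to the avc_biwgt_*width_msa helpers; one of
 * them plausibly expands to:
 */
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, int stride,
                                    int height, int log2_denom,
                                    int weight_dst, int weight_src, int offset)
{
    avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
                         weight_src, weight_dst, offset);
}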