h264dsp_msa.c
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
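/* Unidirectional H.264 weighted prediction on a 4x2 block: each pixel is
 * multiplied by src_weight, the offset (including its rounding term) is
 * added, and the sum is shifted down by log2_denom and clamped to [0, 255]
 * before being stored back in place. */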
24 static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
25  int32_t log2_denom, int32_t src_weight,
26  int32_t offset_in)
27 {
28  uint32_t data0, data1;
29  v16u8 zero = { 0 };
30  v16u8 src0, src1;
31  v4i32 res0, res1;
32  v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
33  v8u16 out0, out1;
34 
35  offset_in <<= (log2_denom);
36 
37  if (log2_denom) {
38  offset_in += (1 << (log2_denom - 1));
39  }
40 
41  wgt = __msa_fill_h(src_weight);
42  offset = __msa_fill_h(offset_in);
43  denom = __msa_fill_h(log2_denom);
44 
45  data0 = LW(data);
46  data1 = LW(data + stride);
47 
48  src0 = (v16u8) __msa_fill_w(data0);
49  src1 = (v16u8) __msa_fill_w(data1);
50 
51  ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
52  MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
53  ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
54  MAXI_SH2_SH(temp0, temp1, 0);
55 
56  out0 = (v8u16) __msa_srl_h(temp0, denom);
57  out1 = (v8u16) __msa_srl_h(temp1, denom);
58 
59  SAT_UH2_UH(out0, out1, 7);
60  PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);
61 
62  data0 = __msa_copy_u_w(res0, 0);
63  data1 = __msa_copy_u_w(res1, 0);
64  SW(data0, data);
65  data += stride;
66  SW(data1, data);
67 }
68 
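/* Same 4-pixel-wide weighting as above, processing four rows per loop
 * iteration for heights that are a multiple of 4. */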
69 static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
70  int32_t height, int32_t log2_denom,
71  int32_t src_weight, int32_t offset_in)
72 {
73  uint8_t cnt;
74  uint32_t data0, data1, data2, data3;
75  v16u8 zero = { 0 };
76  v16u8 src0, src1, src2, src3;
77  v8u16 temp0, temp1, temp2, temp3, wgt;
78  v8i16 denom, offset;
79 
80  offset_in <<= (log2_denom);
81 
82  if (log2_denom) {
83  offset_in += (1 << (log2_denom - 1));
84  }
85 
86  wgt = (v8u16) __msa_fill_h(src_weight);
87  offset = __msa_fill_h(offset_in);
88  denom = __msa_fill_h(log2_denom);
89 
90  for (cnt = height / 4; cnt--;) {
91  LW4(data, stride, data0, data1, data2, data3);
92 
93  src0 = (v16u8) __msa_fill_w(data0);
94  src1 = (v16u8) __msa_fill_w(data1);
95  src2 = (v16u8) __msa_fill_w(data2);
96  src3 = (v16u8) __msa_fill_w(data3);
97 
98  ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
99  temp0, temp1, temp2, temp3);
100  MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
101  temp0, temp1, temp2, temp3);
102  ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
103  temp0, temp1, temp2, temp3);
104  MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
105  SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
106  SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
107  PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride);
108  data += (4 * stride);
109  }
110 }
111 
112 static void avc_wgt_4width_msa(uint8_t *data, int32_t stride,
113  int32_t height, int32_t log2_denom,
114  int32_t src_weight, int32_t offset_in)
115 {
116  if (2 == height) {
117  avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in);
118  } else {
119  avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight,
120  offset_in);
121  }
122 }
123 
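/* Weighted prediction for 8-pixel-wide blocks, four rows per iteration. */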
124 static void avc_wgt_8width_msa(uint8_t *data, int32_t stride,
125  int32_t height, int32_t log2_denom,
126  int32_t src_weight, int32_t offset_in)
127 {
128  uint8_t cnt;
129  v16u8 zero = { 0 };
130  v16u8 src0, src1, src2, src3;
131  v8u16 src0_r, src1_r, src2_r, src3_r;
132  v8u16 temp0, temp1, temp2, temp3;
133  v8u16 wgt, denom, offset;
134  v16i8 out0, out1;
135 
136  offset_in <<= (log2_denom);
137 
138  if (log2_denom) {
139  offset_in += (1 << (log2_denom - 1));
140  }
141 
142  wgt = (v8u16) __msa_fill_h(src_weight);
143  offset = (v8u16) __msa_fill_h(offset_in);
144  denom = (v8u16) __msa_fill_h(log2_denom);
145 
146  for (cnt = height / 4; cnt--;) {
147  LD_UB4(data, stride, src0, src1, src2, src3);
148  ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
149  src0_r, src1_r, src2_r, src3_r);
150  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
151  temp0, temp1, temp2, temp3);
152  ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
153  temp0, temp1, temp2, temp3);
154  MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
155  SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
156  SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
157  PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
158  ST8x4_UB(out0, out1, data, stride);
159  data += (4 * stride);
160  }
161 }
162 
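/* Weighted prediction for 16-pixel-wide blocks: each row is zero-extended
 * into its low (ILVR) and high (ILVL) halfword halves so the full 16-byte
 * vector is weighted, offset, shifted and clamped, four rows per iteration. */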
163 static void avc_wgt_16width_msa(uint8_t *data, int32_t stride,
164  int32_t height, int32_t log2_denom,
165  int32_t src_weight, int32_t offset_in)
166 {
167  uint8_t cnt;
168  v16i8 zero = { 0 };
169  v16u8 src0, src1, src2, src3;
170  v16u8 dst0, dst1, dst2, dst3;
171  v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
172  v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
173  v8u16 wgt, denom, offset;
174 
175  offset_in <<= (log2_denom);
176 
177  if (log2_denom) {
178  offset_in += (1 << (log2_denom - 1));
179  }
180 
181  wgt = (v8u16) __msa_fill_h(src_weight);
182  offset = (v8u16) __msa_fill_h(offset_in);
183  denom = (v8u16) __msa_fill_h(log2_denom);
184 
185  for (cnt = height / 4; cnt--;) {
186  LD_UB4(data, stride, src0, src1, src2, src3);
187  ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
188  src0_r, src1_r, src2_r, src3_r);
189  ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
190  src0_l, src1_l, src2_l, src3_l);
191  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
192  temp0, temp1, temp2, temp3);
193  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
194  temp4, temp5, temp6, temp7);
195  ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
196  temp0, temp1, temp2, temp3);
197  ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
198  temp4, temp5, temp6, temp7);
199  MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
200  MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0);
201  SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
202  SRL_H4_UH(temp4, temp5, temp6, temp7, denom);
203  SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
204  SAT_UH4_UH(temp4, temp5, temp6, temp7, 7);
205  PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
206  dst0, dst1, dst2, dst3);
207  ST_UB4(dst0, dst1, dst2, dst3, data, stride);
208  data += 4 * stride;
209  }
210 }
211 
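/* Bidirectional (bi-prediction) weighting of a 4x2 block: source and
 * destination pixels are biased to signed range (XOR with 128), combined as
 * src * src_weight + dst * dst_weight via a byte dot-product onto the offset
 * (pre-compensated by 128 * (src_weight + dst_weight)), then shifted down by
 * log2_denom + 1 and clipped to [0, 255]. */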
212 static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
213  uint8_t *dst, int32_t dst_stride,
214  int32_t log2_denom, int32_t src_weight,
215  int32_t dst_weight, int32_t offset_in)
216 {
217  uint32_t load0, load1, out0, out1;
218  v16i8 src_wgt, dst_wgt, wgt;
219  v16i8 src0, src1, dst0, dst1;
220  v8i16 temp0, temp1, denom, offset, add_val;
221  int32_t val = 128 * (src_weight + dst_weight);
222 
223  offset_in = ((offset_in + 1) | 1) << log2_denom;
224 
225  src_wgt = __msa_fill_b(src_weight);
226  dst_wgt = __msa_fill_b(dst_weight);
227  offset = __msa_fill_h(offset_in);
228  denom = __msa_fill_h(log2_denom + 1);
229  add_val = __msa_fill_h(val);
230  offset += add_val;
231 
232  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
233 
234  load0 = LW(src);
235  src += src_stride;
236  load1 = LW(src);
237 
238  src0 = (v16i8) __msa_fill_w(load0);
239  src1 = (v16i8) __msa_fill_w(load1);
240 
241  load0 = LW(dst);
242  load1 = LW(dst + dst_stride);
243 
244  dst0 = (v16i8) __msa_fill_w(load0);
245  dst1 = (v16i8) __msa_fill_w(load1);
246 
247  XORI_B4_128_SB(src0, src1, dst0, dst1);
248  ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);
249 
250  temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
251  temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
252 
253  temp0 >>= denom;
254  temp1 >>= denom;
255 
256  CLIP_SH2_0_255(temp0, temp1);
257  PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);
258 
259  out0 = __msa_copy_u_w((v4i32) dst0, 0);
260  out1 = __msa_copy_u_w((v4i32) dst1, 0);
261  SW(out0, dst);
262  dst += dst_stride;
263  SW(out1, dst);
264 }
265 
266 static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
267  uint8_t *dst, int32_t dst_stride,
268  int32_t height, int32_t log2_denom,
269  int32_t src_weight, int32_t dst_weight,
270  int32_t offset_in)
271 {
272  uint8_t cnt;
273  uint32_t load0, load1, load2, load3;
274  v16i8 src_wgt, dst_wgt, wgt;
275  v16i8 src0, src1, src2, src3;
276  v16i8 dst0, dst1, dst2, dst3;
277  v8i16 temp0, temp1, temp2, temp3;
278  v8i16 denom, offset, add_val;
279  int32_t val = 128 * (src_weight + dst_weight);
280 
281  offset_in = ((offset_in + 1) | 1) << log2_denom;
282 
283  src_wgt = __msa_fill_b(src_weight);
284  dst_wgt = __msa_fill_b(dst_weight);
285  offset = __msa_fill_h(offset_in);
286  denom = __msa_fill_h(log2_denom + 1);
287  add_val = __msa_fill_h(val);
288  offset += add_val;
289 
290  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
291 
292  for (cnt = height / 4; cnt--;) {
293  LW4(src, src_stride, load0, load1, load2, load3);
294  src += (4 * src_stride);
295 
296  src0 = (v16i8) __msa_fill_w(load0);
297  src1 = (v16i8) __msa_fill_w(load1);
298  src2 = (v16i8) __msa_fill_w(load2);
299  src3 = (v16i8) __msa_fill_w(load3);
300 
301  LW4(dst, dst_stride, load0, load1, load2, load3);
302 
303  dst0 = (v16i8) __msa_fill_w(load0);
304  dst1 = (v16i8) __msa_fill_w(load1);
305  dst2 = (v16i8) __msa_fill_w(load2);
306  dst3 = (v16i8) __msa_fill_w(load3);
307 
308  XORI_B4_128_SB(src0, src1, src2, src3);
309  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
310  ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
311  temp0, temp1, temp2, temp3);
312 
313  temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
314  temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
315  temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
316  temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
317 
318  SRA_4V(temp0, temp1, temp2, temp3, denom);
319  CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
320  PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
321  dst += (4 * dst_stride);
322  }
323 }
324 
325 static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
326  uint8_t *dst, int32_t dst_stride,
327  int32_t height, int32_t log2_denom,
328  int32_t src_weight, int32_t dst_weight,
329  int32_t offset_in)
330 {
331  if (2 == height) {
332  avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
333  src_weight, dst_weight, offset_in);
334  } else {
335  avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
336  log2_denom, src_weight, dst_weight,
337  offset_in);
338  }
339 }
340 
341 static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
342  uint8_t *dst, int32_t dst_stride,
343  int32_t height, int32_t log2_denom,
344  int32_t src_weight, int32_t dst_weight,
345  int32_t offset_in)
346 {
347  uint8_t cnt;
348  v16i8 src_wgt, dst_wgt, wgt;
349  v16i8 src0, src1, src2, src3;
350  v16i8 dst0, dst1, dst2, dst3;
351  v16i8 out0, out1;
352  v8i16 temp0, temp1, temp2, temp3;
353  v8i16 denom, offset, add_val;
354  int32_t val = 128 * (src_weight + dst_weight);
355 
356  offset_in = ((offset_in + 1) | 1) << log2_denom;
357 
358  src_wgt = __msa_fill_b(src_weight);
359  dst_wgt = __msa_fill_b(dst_weight);
360  offset = __msa_fill_h(offset_in);
361  denom = __msa_fill_h(log2_denom + 1);
362  add_val = __msa_fill_h(val);
363  offset += add_val;
364 
365  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
366 
367  for (cnt = height / 4; cnt--;) {
368  LD_SB4(src, src_stride, src0, src1, src2, src3);
369  src += (4 * src_stride);
370 
371  LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
372  XORI_B4_128_SB(src0, src1, src2, src3);
373  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
374  ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
375  temp0, temp1, temp2, temp3);
376 
377  temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
378  temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
379  temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
380  temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
381 
382  SRA_4V(temp0, temp1, temp2, temp3, denom);
383  CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
384  PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
385  ST8x4_UB(out0, out1, dst, dst_stride);
386  dst += 4 * dst_stride;
387  }
388 }
389 
390 static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
391  uint8_t *dst, int32_t dst_stride,
392  int32_t height, int32_t log2_denom,
393  int32_t src_weight, int32_t dst_weight,
394  int32_t offset_in)
395 {
396  uint8_t cnt;
397  v16i8 src_wgt, dst_wgt, wgt;
398  v16i8 src0, src1, src2, src3;
399  v16i8 dst0, dst1, dst2, dst3;
400  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
401  v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
402  v8i16 denom, offset, add_val;
403  int32_t val = 128 * (src_weight + dst_weight);
404 
405  offset_in = ((offset_in + 1) | 1) << log2_denom;
406 
407  src_wgt = __msa_fill_b(src_weight);
408  dst_wgt = __msa_fill_b(dst_weight);
409  offset = __msa_fill_h(offset_in);
410  denom = __msa_fill_h(log2_denom + 1);
411  add_val = __msa_fill_h(val);
412  offset += add_val;
413 
414  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
415 
416  for (cnt = height / 4; cnt--;) {
417  LD_SB4(src, src_stride, src0, src1, src2, src3);
418  src += (4 * src_stride);
419 
420  LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
421  XORI_B4_128_SB(src0, src1, src2, src3);
422  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
423  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
424  vec0, vec2, vec4, vec6);
425  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
426  vec1, vec3, vec5, vec7);
427 
428  temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
429  temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
430  temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
431  temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
432  temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
433  temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
434  temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
435  temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
436 
437  SRA_4V(temp0, temp1, temp2, temp3, denom);
438  SRA_4V(temp4, temp5, temp6, temp7, denom);
439  CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
440  CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
441  PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
442  dst0, dst1, dst2, dst3);
443  ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
444  dst += 4 * dst_stride;
445  }
446 }
447 
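/* Strong (bS = 4) luma filter for one side of the edge; with arguments
 * (p3, p0, q0, p1, p2, q1) it computes
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * and symmetrically for q0'/q1'/q2' when called with q-side arguments. */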
448 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
449  q3_or_p3_org_in, p1_or_q1_org_in, \
450  p2_or_q2_org_in, q1_or_p1_org_in, \
451  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
452 { \
453  v8i16 threshold; \
454  v8i16 const3 = __msa_ldi_h(3); \
455  \
456  threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
457  threshold += (p1_or_q1_org_in); \
458  \
459  (p0_or_q0_out) = threshold << 1; \
460  (p0_or_q0_out) += (p2_or_q2_org_in); \
461  (p0_or_q0_out) += (q1_or_p1_org_in); \
462  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
463  \
464  (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
465  (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
466  \
467  (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
468  (p2_or_q2_out) += (p3_or_q3_org_in); \
469  (p2_or_q2_out) += (p3_or_q3_org_in); \
470  (p2_or_q2_out) += threshold; \
471  (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
472 }
473 
474 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
475 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
476  p1_or_q1_org_in, p0_or_q0_out) \
477 { \
478  (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
479  (p0_or_q0_out) += (p1_or_q1_org_in); \
480  (p0_or_q0_out) += (p1_or_q1_org_in); \
481  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
482 }
483 
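/* Normal-filter update of p1 (or q1):
 *   p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1) */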
484 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
485  p1_or_q1_org_in, p2_or_q2_org_in, \
486  negate_tc_in, tc_in, p1_or_q1_out) \
487 { \
488  v8i16 clip3, temp; \
489  \
490  clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
491  (v8u16) q0_or_p0_org_in); \
492  temp = p1_or_q1_org_in << 1; \
493  clip3 = clip3 - temp; \
494  clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
495  clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \
496  p1_or_q1_out = p1_or_q1_org_in + clip3; \
497 }
498 
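/* Normal-filter update of the edge pixels:
 *   delta = clip3(-tc, tc, (4*(q0 - p0) + (p1 - q1) + 4) >> 3)
 *   p0'   = clip(p0 + delta),  q0' = clip(q0 - delta) */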
499 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
500  p1_or_q1_org_in, q1_or_p1_org_in, \
501  negate_threshold_in, threshold_in, \
502  p0_or_q0_out, q0_or_p0_out) \
503 { \
504  v8i16 q0_sub_p0, p1_sub_q1, delta; \
505  \
506  q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
507  p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
508  q0_sub_p0 <<= 2; \
509  p1_sub_q1 += 4; \
510  delta = q0_sub_p0 + p1_sub_q1; \
511  delta >>= 3; \
512  \
513  delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \
514  \
515  p0_or_q0_out = p0_or_q0_org_in + delta; \
516  q0_or_p0_out = q0_or_p0_org_in - delta; \
517  \
518  CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
519 }
520 
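/* Filters one vertical chroma edge across four rows (the 4:2:2 case, per the
 * macro name): 4x4 bytes straddling the edge are loaded and transposed, the
 * normal p0/q0 chroma filter is applied where the alpha/beta tests pass, and
 * the two filtered columns are returned interleaved in 'res'. */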
521 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
522 { \
523  uint32_t load0, load1, load2, load3; \
524  v16u8 src0 = { 0 }; \
525  v16u8 src1 = { 0 }; \
526  v16u8 src2 = { 0 }; \
527  v16u8 src3 = { 0 }; \
528  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
529  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
530  v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
531  v8i16 res0_r, res1_r; \
532  v16i8 zeros = { 0 }; \
533  v16u8 res0, res1; \
534  \
535  LW4((src - 2), stride, load0, load1, load2, load3); \
536  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
537  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
538  src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
539  src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
540  \
541  TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
542  \
543  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
544  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
545  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
546  \
547  tc = __msa_fill_h(tc_val); \
548  \
549  is_less_than_alpha = (p0_asub_q0 < alpha); \
550  is_less_than_beta = (p1_asub_p0 < beta); \
551  is_less_than = is_less_than_alpha & is_less_than_beta; \
552  is_less_than_beta = (q1_asub_q0 < beta); \
553  is_less_than = is_less_than_beta & is_less_than; \
554  \
555  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
556  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
557  \
558  q0_sub_p0 <<= 2; \
559  delta = q0_sub_p0 + p1_sub_q1; \
560  delta = __msa_srari_h(delta, 3); \
561  \
562  delta = CLIP_SH(delta, -tc, tc); \
563  \
564  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
565  \
566  res0_r += delta; \
567  res1_r -= delta; \
568  \
569  CLIP_SH2_0_255(res0_r, res1_r); \
570  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
571  \
572  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
573  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
574  \
575  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
576 }
577 
578 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
579 { \
580  v16i8 zero_m = { 0 }; \
581  \
582  out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
583  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
584  SLDI_B2_0_UB(out1, out2, out2, out3, 2); \
585 }
586 
587 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
588 { \
589  uint32_t load0, load1; \
590  v16u8 src0 = { 0 }; \
591  v16u8 src1 = { 0 }; \
592  v16u8 src2 = { 0 }; \
593  v16u8 src3 = { 0 }; \
594  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
595  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
596  v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
597  v16i8 zeros = { 0 }; \
598  v16u8 res0, res1; \
599  \
600  load0 = LW(src - 2); \
601  load1 = LW(src - 2 + stride); \
602  \
603  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
604  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
605  \
606  TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
607  \
608  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
609  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
610  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
611  \
612  tc = __msa_fill_h(tc_val); \
613  \
614  is_less_than_alpha = (p0_asub_q0 < alpha); \
615  is_less_than_beta = (p1_asub_p0 < beta); \
616  is_less_than = is_less_than_alpha & is_less_than_beta; \
617  is_less_than_beta = (q1_asub_q0 < beta); \
618  is_less_than = is_less_than_beta & is_less_than; \
619  \
620  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
621  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
622  \
623  q0_sub_p0 <<= 2; \
624  delta = q0_sub_p0 + p1_sub_q1; \
625  delta = __msa_srari_h(delta, 3); \
626  delta = CLIP_SH(delta, -tc, tc); \
627  \
628  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
629  \
630  res0_r += delta; \
631  res1_r -= delta; \
632  \
633  CLIP_SH2_0_255(res0_r, res1_r); \
634  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
635  \
636  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
637  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
638  \
639  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
640 }
641 
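/* Strong (intra, bS = 4) deblocking of a horizontal luma edge: filtering is
 * enabled where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; where the
 * extra |p0-q0| < (alpha >> 2) + 2 test also holds together with |p2-p0| < beta
 * (resp. |q2-q0| < beta), p0..p2 (resp. q0..q2) receive the strong 3-tap
 * filter, otherwise only p0/q0 are filtered with (2*p1 + p0 + q1 + 2) >> 2. */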
642 static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
643  uint8_t alpha_in,
644  uint8_t beta_in,
645  uint32_t img_width)
646 {
647  v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
648  v16u8 alpha, beta;
649  v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
650  v16u8 p2, p1, p0, q0, q1, q2;
651  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
652  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
653  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
654  v8i16 p2_r = { 0 };
655  v8i16 p1_r = { 0 };
656  v8i16 p0_r = { 0 };
657  v8i16 q0_r = { 0 };
658  v8i16 q1_r = { 0 };
659  v8i16 q2_r = { 0 };
660  v8i16 p2_l = { 0 };
661  v8i16 p1_l = { 0 };
662  v8i16 p0_l = { 0 };
663  v8i16 q0_l = { 0 };
664  v8i16 q1_l = { 0 };
665  v8i16 q2_l = { 0 };
666  v16u8 tmp_flag;
667  v16i8 zero = { 0 };
668 
669  alpha = (v16u8) __msa_fill_b(alpha_in);
670  beta = (v16u8) __msa_fill_b(beta_in);
671 
672  LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
673 
674  {
675  v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
676 
677  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
678  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
679  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
680 
681  is_less_than_alpha = (p0_asub_q0 < alpha);
682  is_less_than_beta = (p1_asub_p0 < beta);
683  is_less_than = is_less_than_beta & is_less_than_alpha;
684  is_less_than_beta = (q1_asub_q0 < beta);
685  is_less_than = is_less_than_beta & is_less_than;
686  }
687 
688  if (!__msa_test_bz_v(is_less_than)) {
689  q2_org = LD_UB(data + (2 * img_width));
690  p3_org = LD_UB(data - (img_width << 2));
691  p2_org = LD_UB(data - (3 * img_width));
692 
693  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
694  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
695  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
696 
697  tmp_flag = alpha >> 2;
698  tmp_flag = tmp_flag + 2;
699  tmp_flag = (p0_asub_q0 < tmp_flag);
700 
701  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
702  is_less_than_beta = (p2_asub_p0 < beta);
703  is_less_than_beta = is_less_than_beta & tmp_flag;
704  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
705  is_less_than_beta = is_less_than_beta & is_less_than;
706  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
707  {
708  v8u16 is_less_than_beta_l, is_less_than_beta_r;
709 
710  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
711 
712  is_less_than_beta_r =
713  (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
714  if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
715  v8i16 p3_org_r;
716 
717  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
718  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
719  p2_r, q1_org_r, p0_r, p1_r, p2_r);
720  }
721 
722  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
723 
724  is_less_than_beta_l =
725  (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
726 
727  if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
728  v8i16 p3_org_l;
729 
730  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
731  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
732  p2_l, q1_org_l, p0_l, p1_l, p2_l);
733  }
734  }
735  /* combine and store */
736  if (!__msa_test_bz_v(is_less_than_beta)) {
737  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
738 
739  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
740  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
741  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
742 
743  ST_UB(p1_org, data - (2 * img_width));
744  ST_UB(p2_org, data - (3 * img_width));
745  }
746  {
747  v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
748 
749  negate_is_less_than_beta_r =
750  (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
751  if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
752  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
753  }
754 
755  negate_is_less_than_beta_l =
756  (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
757  if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
758  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
759  }
760  }
761  /* combine */
762  if (!__msa_test_bz_v(negate_is_less_than_beta)) {
763  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
764  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
765  }
766 
767  ST_UB(p0_org, data - img_width);
768 
769  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
770  q3_org = LD_UB(data + (3 * img_width));
771  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772  is_less_than_beta = (q2_asub_q0 < beta);
773  is_less_than_beta = is_less_than_beta & tmp_flag;
774  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775  is_less_than_beta = is_less_than_beta & is_less_than;
776  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777 
778  {
779  v8u16 is_less_than_beta_l, is_less_than_beta_r;
780  is_less_than_beta_r =
781  (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
782  if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
783  v8i16 q3_org_r;
784 
785  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
786  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
787  q2_r, p1_org_r, q0_r, q1_r, q2_r);
788  }
789  is_less_than_beta_l =
790  (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
791  if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
792  v8i16 q3_org_l;
793 
794  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
795  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
796  q2_l, p1_org_l, q0_l, q1_l, q2_l);
797  }
798  }
799 
800  /* combine and store */
801  if (!__msa_test_bz_v(is_less_than_beta)) {
802  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
803  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
804  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
805  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
806 
807  ST_UB(q1_org, data + img_width);
808  ST_UB(q2_org, data + 2 * img_width);
809  }
810  {
811  v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
812  negate_is_less_than_beta_r =
813  (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
814  if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
815  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
816  }
817 
818  negate_is_less_than_beta_l =
819  (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
820  if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
821  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
822  }
823  }
824  /* combine */
825  if (!__msa_test_bz_v(negate_is_less_than_beta)) {
826  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
827  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
828  }
829  ST_UB(q0_org, data);
830  }
831 }
832 
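/* Same strong intra filter for a vertical luma edge: a 16x8 block around the
 * edge is loaded and transposed so the edge runs horizontally, filtered as
 * above, then transposed back and written out with 4-byte + 2-byte stores. */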
833 static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
834  uint8_t alpha_in,
835  uint8_t beta_in,
836  uint32_t img_width)
837 {
838  uint8_t *src;
839  v16u8 alpha, beta, p0_asub_q0;
840  v16u8 is_less_than_alpha, is_less_than;
841  v16u8 is_less_than_beta, negate_is_less_than_beta;
842  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
843  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
844  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
845  v8i16 p2_r = { 0 };
846  v8i16 p1_r = { 0 };
847  v8i16 p0_r = { 0 };
848  v8i16 q0_r = { 0 };
849  v8i16 q1_r = { 0 };
850  v8i16 q2_r = { 0 };
851  v8i16 p2_l = { 0 };
852  v8i16 p1_l = { 0 };
853  v8i16 p0_l = { 0 };
854  v8i16 q0_l = { 0 };
855  v8i16 q1_l = { 0 };
856  v8i16 q2_l = { 0 };
857  v16i8 zero = { 0 };
858  v16u8 tmp_flag;
859 
860  src = data - 4;
861  {
862  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
863  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
864 
865  LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
866  LD_UB8(src + (8 * img_width), img_width,
867  row8, row9, row10, row11, row12, row13, row14, row15);
868 
869  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
870  row4, row5, row6, row7,
871  row8, row9, row10, row11,
872  row12, row13, row14, row15,
873  p3_org, p2_org, p1_org, p0_org,
874  q0_org, q1_org, q2_org, q3_org);
875  }
876  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
877  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
878  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
879  UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
880 
881  /* if ( ((unsigned)ABS(p0-q0) < thresholds->alpha_in) &&
882  ((unsigned)ABS(p1-p0) < thresholds->beta_in) &&
883  ((unsigned)ABS(q1-q0) < thresholds->beta_in) ) */
884  {
885  v16u8 p1_asub_p0, q1_asub_q0;
886 
887  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
888  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
889  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
890 
891  alpha = (v16u8) __msa_fill_b(alpha_in);
892  beta = (v16u8) __msa_fill_b(beta_in);
893 
894  is_less_than_alpha = (p0_asub_q0 < alpha);
895  is_less_than_beta = (p1_asub_p0 < beta);
896  is_less_than = is_less_than_beta & is_less_than_alpha;
897  is_less_than_beta = (q1_asub_q0 < beta);
898  is_less_than = is_less_than_beta & is_less_than;
899  }
900 
901  if (!__msa_test_bz_v(is_less_than)) {
902  tmp_flag = alpha >> 2;
903  tmp_flag = tmp_flag + 2;
904  tmp_flag = (p0_asub_q0 < tmp_flag);
905 
906  {
907  v16u8 p2_asub_p0;
908 
909  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
910  is_less_than_beta = (p2_asub_p0 < beta);
911  }
912  is_less_than_beta = tmp_flag & is_less_than_beta;
913  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
914  is_less_than_beta = is_less_than_beta & is_less_than;
915  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
916 
917  /* right */
918  {
919  v16u8 is_less_than_beta_r;
920 
921  is_less_than_beta_r =
922  (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
923  if (!__msa_test_bz_v(is_less_than_beta_r)) {
924  v8i16 p3_org_r;
925 
926  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
927  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
928  p2_r, q1_org_r, p0_r, p1_r, p2_r);
929  }
930  }
931  /* left */
932  {
933  v16u8 is_less_than_beta_l;
934 
935  is_less_than_beta_l =
936  (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
937  if (!__msa_test_bz_v(is_less_than_beta_l)) {
938  v8i16 p3_org_l;
939 
940  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
941  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
942  p2_l, q1_org_l, p0_l, p1_l, p2_l);
943  }
944  }
945  /* combine and store */
946  if (!__msa_test_bz_v(is_less_than_beta)) {
947  v16u8 p0, p2, p1;
948 
949  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
950  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
951  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
952  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
953  }
954  /* right */
955  {
956  v16u8 negate_is_less_than_beta_r;
957 
958  negate_is_less_than_beta_r =
959  (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
960 
961  if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
962  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
963  }
964  }
965  /* left */
966  {
967  v16u8 negate_is_less_than_beta_l;
968 
969  negate_is_less_than_beta_l =
970  (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
971  if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
972  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
973  }
974  }
975 
976  if (!__msa_test_bz_v(negate_is_less_than_beta)) {
977  v16u8 p0;
978 
979  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
980  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
981  }
982 
983  {
984  v16u8 q2_asub_q0;
985 
986  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
987  is_less_than_beta = (q2_asub_q0 < beta);
988  }
989 
990  is_less_than_beta = is_less_than_beta & tmp_flag;
991  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
992 
993  is_less_than_beta = is_less_than_beta & is_less_than;
994  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
995 
996  /* right */
997  {
998  v16u8 is_less_than_beta_r;
999 
1000  is_less_than_beta_r =
1001  (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
1002  if (!__msa_test_bz_v(is_less_than_beta_r)) {
1003  v8i16 q3_org_r;
1004 
1005  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
1006  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
1007  q2_r, p1_org_r, q0_r, q1_r, q2_r);
1008  }
1009  }
1010  /* left */
1011  {
1012  v16u8 is_less_than_beta_l;
1013 
1014  is_less_than_beta_l =
1015  (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
1016  if (!__msa_test_bz_v(is_less_than_beta_l)) {
1017  v8i16 q3_org_l;
1018 
1019  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
1020  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
1021  q2_l, p1_org_l, q0_l, q1_l, q2_l);
1022  }
1023  }
1024  /* combine and store */
1025  if (!__msa_test_bz_v(is_less_than_beta)) {
1026  v16u8 q0, q1, q2;
1027 
1028  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
1029  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
1030  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1031  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
1032  }
1033 
1034  /* right */
1035  {
1036  v16u8 negate_is_less_than_beta_r;
1037 
1038  negate_is_less_than_beta_r =
1039  (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
1040  if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
1041  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
1042  }
1043  }
1044  /* left */
1045  {
1046  v16u8 negate_is_less_than_beta_l;
1047 
1048  negate_is_less_than_beta_l =
1049  (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
1050  if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
1051  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
1052  }
1053  }
1054  if (!__msa_test_bz_v(negate_is_less_than_beta)) {
1055  v16u8 q0;
1056 
1057  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
1058  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
1059  }
1060  }
1061  {
1062  v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1063 
1064  ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
1065  ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
1066  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1067 
1068  ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
1069  ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
1070 
1071  src = data - 3;
1072  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
1073  ST2x4_UB(tmp2, 0, src + 4, img_width);
1074  src += 4 * img_width;
1075  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
1076  ST2x4_UB(tmp2, 4, src + 4, img_width);
1077  src += 4 * img_width;
1078 
1079  ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
1080  ST2x4_UB(tmp5, 0, src + 4, img_width);
1081  src += 4 * img_width;
1082  ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
1083  ST2x4_UB(tmp5, 4, src + 4, img_width);
1084  }
1085 }
1086 
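/* Strong intra filtering of a vertical luma edge, eight rows at a time:
 * 8x8 bytes around the edge are transposed into p3..q3 vectors, the strong
 * and fallback filter results are blended per pixel according to the
 * alpha/beta and (alpha >> 2) + 2 tests, and 6 bytes per row (p2'..q2')
 * are written back. */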
1087 static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
1088  int32_t alpha_in,
1089  int32_t beta_in)
1090 {
1091  uint64_t load0, load1;
1092  uint32_t out0, out2;
1093  uint16_t out1, out3;
1094  v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
1095  v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
1096  v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
1097  v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
1098  v8i16 tmp0, tmp1, tmp2, tmp3;
1099  v16u8 alpha, beta;
1100  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1101  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1102  v16u8 is_less_than_beta1, is_less_than_beta2;
1103  v16i8 src0 = { 0 };
1104  v16i8 src1 = { 0 };
1105  v16i8 src2 = { 0 };
1106  v16i8 src3 = { 0 };
1107  v16i8 src4 = { 0 };
1108  v16i8 src5 = { 0 };
1109  v16i8 src6 = { 0 };
1110  v16i8 src7 = { 0 };
1111  v16i8 zeros = { 0 };
1112 
1113  load0 = LD(src - 4);
1114  load1 = LD(src + stride - 4);
1115  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1116  src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1117 
1118  load0 = LD(src + (2 * stride) - 4);
1119  load1 = LD(src + (3 * stride) - 4);
1120  src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1121  src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1122 
1123  load0 = LD(src + (4 * stride) - 4);
1124  load1 = LD(src + (5 * stride) - 4);
1125  src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1126  src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1127 
1128  load0 = LD(src + (6 * stride) - 4);
1129  load1 = LD(src + (7 * stride) - 4);
1130  src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1131  src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1132 
1133  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1134  src0, src1, src2, src3);
1135 
1136  ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1137  ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1138 
1139  ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1140  ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1141  SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
1142 
1143  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1144  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1145  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1146 
1147  alpha = (v16u8) __msa_fill_b(alpha_in);
1148  beta = (v16u8) __msa_fill_b(beta_in);
1149 
1150  is_less_than_alpha = (p0_asub_q0 < alpha);
1151  is_less_than_beta = (p1_asub_p0 < beta);
1152  is_less_than = is_less_than_alpha & is_less_than_beta;
1153  is_less_than_beta = (q1_asub_q0 < beta);
1154  is_less_than = is_less_than & is_less_than_beta;
1155 
1156  alpha >>= 2;
1157  alpha += 2;
1158 
1159  is_less_than_alpha = (p0_asub_q0 < alpha);
1160 
1161  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1162  is_less_than_beta1 = (p2_asub_p0 < beta);
1163  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1164  is_less_than_beta2 = (q2_asub_q0 < beta);
1165 
1166  ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1167  src0_r, src1_r, src2_r, src3_r);
1168  ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1169  src4_r, src5_r, src6_r, src7_r);
1170 
1171  dst2_x_r = src1_r + src2_r + src3_r;
1172  dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1173  dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1174  dst1_r = src0_r + src1_r + src2_r + src3_r;
1175  dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1176 
1177  dst0_r = (2 * src6_r) + (3 * src0_r);
1178  dst0_r += src1_r + src2_r + src3_r;
1179  dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1180  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1181  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1182 
1183  PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1184  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1185 
1186  dst3_x_r = src2_r + src3_r + src4_r;
1187  dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1188  dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1189  dst4_r = src2_r + src3_r + src4_r + src5_r;
1190  dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1191 
1192  dst5_r = (2 * src7_r) + (3 * src5_r);
1193  dst5_r += src4_r + src3_r + src2_r;
1194  dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1195  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1196  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1197 
1198  PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1199  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1200 
1201  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1202  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1203  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1204  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1205 
1206  PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1207 
1208  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1209  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1210  dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1211  dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1212 
1213  is_less_than = is_less_than_alpha & is_less_than;
1214  dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1215  is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1216  dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1217 
1218  dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1219  dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1220  dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1221  is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1222  dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1223  dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1224  dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1225 
1226  ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1227  dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1228  ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1229  ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1230 
1231  ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1232  SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
1233  dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1234  dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1235  SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
1236 
1237  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1238  out1 = __msa_copy_u_h((v8i16) dst0, 2);
1239  out2 = __msa_copy_u_w((v4i32) dst1, 0);
1240  out3 = __msa_copy_u_h((v8i16) dst1, 2);
1241 
1242  SW(out0, (src - 3));
1243  SH(out1, (src + 1));
1244  src += stride;
1245  SW(out2, (src - 3));
1246  SH(out3, (src + 1));
1247  src += stride;
1248 
1249  out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1250  out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1251  out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1252  out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1253 
1254  SW(out0, (src - 3));
1255  SH(out1, (src + 1));
1256  src += stride;
1257  SW(out2, (src - 3));
1258  SH(out3, (src + 1));
1259  src += stride;
1260 
1261  out0 = __msa_copy_u_w((v4i32) dst4, 0);
1262  out1 = __msa_copy_u_h((v8i16) dst4, 2);
1263  out2 = __msa_copy_u_w((v4i32) dst5, 0);
1264  out3 = __msa_copy_u_h((v8i16) dst5, 2);
1265 
1266  SW(out0, (src - 3));
1267  SH(out1, (src + 1));
1268  src += stride;
1269  SW(out2, (src - 3));
1270  SH(out3, (src + 1));
1271  src += stride;
1272 
1273  out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1274  out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1275  out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1276  out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1277 
1278  SW(out0, (src - 3));
1279  SH(out1, (src + 1));
1280  src += stride;
1281  SW(out2, (src - 3));
1282  SH(out3, (src + 1));
1283 }
1284 
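/* Intra deblocking of a horizontal chroma edge: under the alpha/beta
 * conditions only p0 and q0 are filtered, each with (2*p1 + p0 + q1 + 2) >> 2. */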
1285 static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1286  uint8_t alpha_in,
1287  uint8_t beta_in,
1288  uint32_t img_width)
1289 {
1290  v16u8 alpha, beta;
1291  v16u8 is_less_than;
1292  v8i16 p0_or_q0, q0_or_p0;
1293  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1294  v16i8 zero = { 0 };
1295  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1296  v16u8 is_less_than_alpha, is_less_than_beta;
1297  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1298 
1299  alpha = (v16u8) __msa_fill_b(alpha_in);
1300  beta = (v16u8) __msa_fill_b(beta_in);
1301 
1302  LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1303  p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1304 
1305  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1306  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1307  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1308 
1309  is_less_than_alpha = (p0_asub_q0 < alpha);
1310  is_less_than_beta = (p1_asub_p0 < beta);
1311  is_less_than = is_less_than_beta & is_less_than_alpha;
1312  is_less_than_beta = (q1_asub_q0 < beta);
1313  is_less_than = is_less_than_beta & is_less_than;
1314 
1315  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1316 
1317  if (!__msa_test_bz_v(is_less_than)) {
1318  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1319  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1320  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1321  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1322  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1323 
1324  p0_or_q0_org =
1325  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1326  q0_or_p0_org =
1327  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1328 
1329  ST_UB(q0_or_p0_org, data_cb_or_cr);
1330  ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1331  }
1332 }
1333 
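/* Vertical-edge variant of the chroma intra filter: an 8x4 transpose brings
 * the edge pixels into vectors, p0/q0 are filtered, and the two result bytes
 * per row are written back with 2-byte stores. */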
1334 static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1335  uint8_t alpha_in,
1336  uint8_t beta_in,
1337  uint32_t img_width)
1338 {
1339  v8i16 tmp1;
1340  v16u8 alpha, beta, is_less_than;
1341  v8i16 p0_or_q0, q0_or_p0;
1342  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1343  v16i8 zero = { 0 };
1344  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1345  v16u8 is_less_than_alpha, is_less_than_beta;
1346  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1347 
1348  {
1349  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1350 
1351  LD_UB8((data_cb_or_cr - 2), img_width,
1352  row0, row1, row2, row3, row4, row5, row6, row7);
1353 
1354  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1355  p1_or_q1_org, p0_or_q0_org,
1356  q0_or_p0_org, q1_or_p1_org);
1357  }
1358 
1359  alpha = (v16u8) __msa_fill_b(alpha_in);
1360  beta = (v16u8) __msa_fill_b(beta_in);
1361 
1362  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1363  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1364  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1365 
1366  is_less_than_alpha = (p0_asub_q0 < alpha);
1367  is_less_than_beta = (p1_asub_p0 < beta);
1368  is_less_than = is_less_than_beta & is_less_than_alpha;
1369  is_less_than_beta = (q1_asub_q0 < beta);
1370  is_less_than = is_less_than_beta & is_less_than;
1371  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1372 
1373  if (!__msa_test_bz_v(is_less_than)) {
1374  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1375  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1376 
1377  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1378  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1379 
1380  /* convert 16 bit output into 8 bit output */
1381  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1382 
1383  p0_or_q0_org =
1384  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1385  q0_or_p0_org =
1386  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1387  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1388 
1389  data_cb_or_cr -= 1;
1390  ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
1391  data_cb_or_cr += 4 * img_width;
1392  ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
1393  }
1394 }
1395 
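/* Normal (inter, bS < 4) filtering of a vertical luma edge: the per-group
 * boundary strengths and tc0 values are replicated into 4-pixel lanes, a
 * 16x8 transpose exposes p3..q3, p1/q1 are conditionally updated (raising
 * the clipping threshold tc where |p2-p0| resp. |q2-q0| < beta), p0/q0 get
 * the clip3-based delta, and the result is transposed back and stored. */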
1396 static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
1397  uint8_t bs0, uint8_t bs1,
1398  uint8_t bs2, uint8_t bs3,
1399  uint8_t tc0, uint8_t tc1,
1400  uint8_t tc2, uint8_t tc3,
1401  uint8_t alpha_in,
1402  uint8_t beta_in,
1403  uint32_t img_width)
1404 {
1405  uint8_t *src;
1406  v16u8 beta, tmp_vec, bs = { 0 };
1407  v16u8 tc = { 0 };
1408  v16u8 is_less_than, is_less_than_beta;
1409  v16u8 p1, p0, q0, q1;
1410  v8i16 p0_r, q0_r, p1_r = { 0 };
1411  v8i16 q1_r = { 0 };
1412  v8i16 p0_l, q0_l, p1_l = { 0 };
1413  v8i16 q1_l = { 0 };
1414  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
1415  v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
1416  v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
1417  v8i16 tc_r, tc_l;
1418  v16i8 zero = { 0 };
1419  v16u8 is_bs_greater_than0;
1420 
1421  tmp_vec = (v16u8) __msa_fill_b(bs0);
1422  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1423  tmp_vec = (v16u8) __msa_fill_b(bs1);
1424  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1425  tmp_vec = (v16u8) __msa_fill_b(bs2);
1426  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1427  tmp_vec = (v16u8) __msa_fill_b(bs3);
1428  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1429 
1430  if (!__msa_test_bz_v(bs)) {
1431  tmp_vec = (v16u8) __msa_fill_b(tc0);
1432  tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1433  tmp_vec = (v16u8) __msa_fill_b(tc1);
1434  tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1435  tmp_vec = (v16u8) __msa_fill_b(tc2);
1436  tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1437  tmp_vec = (v16u8) __msa_fill_b(tc3);
1438  tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1439 
1440  is_bs_greater_than0 = (zero < bs);
1441 
1442  {
1443  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1444  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1445 
1446  src = data;
1447  src -= 4;
1448 
1449  LD_UB8(src, img_width,
1450  row0, row1, row2, row3, row4, row5, row6, row7);
1451  src += (8 * img_width);
1452  LD_UB8(src, img_width,
1453  row8, row9, row10, row11, row12, row13, row14, row15);
1454 
1455  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1456  row8, row9, row10, row11,
1457  row12, row13, row14, row15,
1458  p3_org, p2_org, p1_org, p0_org,
1459  q0_org, q1_org, q2_org, q3_org);
1460  }
1461  {
1462  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
1463  v16u8 is_less_than_alpha;
1464 
1465  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1466  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1467  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1468 
1469  alpha = (v16u8) __msa_fill_b(alpha_in);
1470  beta = (v16u8) __msa_fill_b(beta_in);
1471 
1472  is_less_than_alpha = (p0_asub_q0 < alpha);
1473  is_less_than_beta = (p1_asub_p0 < beta);
1474  is_less_than = is_less_than_beta & is_less_than_alpha;
1475  is_less_than_beta = (q1_asub_q0 < beta);
1476  is_less_than = is_less_than_beta & is_less_than;
1477  is_less_than = is_less_than & is_bs_greater_than0;
1478  }
1479  if (!__msa_test_bz_v(is_less_than)) {
1480  v16i8 negate_tc, sign_negate_tc;
1481  v8i16 negate_tc_r, i16_negatetc_l;
1482 
1483  negate_tc = zero - (v16i8) tc;
1484  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1485 
1486  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1487 
1488  UNPCK_UB_SH(tc, tc_r, tc_l);
1489  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1490  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1491  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1492 
1493  {
1494  v16u8 p2_asub_p0;
1495  v16u8 is_less_than_beta_r, is_less_than_beta_l;
1496 
1497  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1498  is_less_than_beta = (p2_asub_p0 < beta);
1499  is_less_than_beta = is_less_than_beta & is_less_than;
1500 
1501  is_less_than_beta_r =
1502  (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
1503  if (!__msa_test_bz_v(is_less_than_beta_r)) {
1504  p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1505 
1506  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1507  negate_tc_r, tc_r, p1_r);
1508  }
1509 
1510  is_less_than_beta_l =
1511  (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
1512  if (!__msa_test_bz_v(is_less_than_beta_l)) {
1513  p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1514 
1515  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1516  i16_negatetc_l, tc_l, p1_l);
1517  }
1518  }
1519 
1520  if (!__msa_test_bz_v(is_less_than_beta)) {
1521  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1522  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1523 
1524  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1525  tc = tc + is_less_than_beta;
1526  }
1527 
1528  {
1529  v16u8 u8_q2asub_q0;
1530  v16u8 is_less_than_beta_l, is_less_than_beta_r;
1531 
1532  u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1533  is_less_than_beta = (u8_q2asub_q0 < beta);
1534  is_less_than_beta = is_less_than_beta & is_less_than;
1535 
1536  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1537 
1538  is_less_than_beta_r =
1539  (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
1540  if (!__msa_test_bz_v(is_less_than_beta_r)) {
1541  q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1542  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1543  negate_tc_r, tc_r, q1_r);
1544  }
1545 
1546  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1547 
1548  is_less_than_beta_l =
1549  (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
1550  if (!__msa_test_bz_v(is_less_than_beta_l)) {
1551  q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1552  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1553  i16_negatetc_l, tc_l, q1_l);
1554  }
1555  }
1556 
1557  if (!__msa_test_bz_v(is_less_than_beta)) {
1558  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1559  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1560 
1561  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1562  tc = tc + is_less_than_beta;
1563  }
1564 
1565  {
1566  v8i16 threshold_r, negate_thresh_r;
1567  v8i16 threshold_l, negate_thresh_l;
1568  v16i8 negate_thresh, sign_negate_thresh;
1569 
1570  negate_thresh = zero - (v16i8) tc;
1571  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1572 
1573  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1574  threshold_r, negate_thresh_r);
1575 
1576  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1577  negate_thresh_r, threshold_r, p0_r, q0_r);
1578 
1579  threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
1580  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1581  negate_thresh);
1582 
1583  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1584  negate_thresh_l, threshold_l, p0_l, q0_l);
1585  }
1586 
1587  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1588 
1589  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1590  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1591  }
1592  {
1593  v16i8 tp0, tp1, tp2, tp3;
1594  v8i16 tmp2, tmp5;
1595  v4i32 tmp3, tmp4, tmp6, tmp7;
1596  uint32_t out0, out2;
1597  uint16_t out1, out3;
1598 
1599  src = data - 3;
1600 
1601  ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
1602  ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
1603  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1604 
1605  ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
1606  ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
1607 
1608  out0 = __msa_copy_u_w(tmp3, 0);
1609  out1 = __msa_copy_u_h(tmp2, 0);
1610  out2 = __msa_copy_u_w(tmp3, 1);
1611  out3 = __msa_copy_u_h(tmp2, 1);
1612 
1613  SW(out0, src);
1614  SH(out1, (src + 4));
1615  src += img_width;
1616  SW(out2, src);
1617  SH(out3, (src + 4));
1618 
1619  out0 = __msa_copy_u_w(tmp3, 2);
1620  out1 = __msa_copy_u_h(tmp2, 2);
1621  out2 = __msa_copy_u_w(tmp3, 3);
1622  out3 = __msa_copy_u_h(tmp2, 3);
1623 
1624  src += img_width;
1625  SW(out0, src);
1626  SH(out1, (src + 4));
1627  src += img_width;
1628  SW(out2, src);
1629  SH(out3, (src + 4));
1630 
1631  out0 = __msa_copy_u_w(tmp4, 0);
1632  out1 = __msa_copy_u_h(tmp2, 4);
1633  out2 = __msa_copy_u_w(tmp4, 1);
1634  out3 = __msa_copy_u_h(tmp2, 5);
1635 
1636  src += img_width;
1637  SW(out0, src);
1638  SH(out1, (src + 4));
1639  src += img_width;
1640  SW(out2, src);
1641  SH(out3, (src + 4));
1642 
1643  out0 = __msa_copy_u_w(tmp4, 2);
1644  out1 = __msa_copy_u_h(tmp2, 6);
1645  out2 = __msa_copy_u_w(tmp4, 3);
1646  out3 = __msa_copy_u_h(tmp2, 7);
1647 
1648  src += img_width;
1649  SW(out0, src);
1650  SH(out1, (src + 4));
1651  src += img_width;
1652  SW(out2, src);
1653  SH(out3, (src + 4));
1654 
1655  out0 = __msa_copy_u_w(tmp6, 0);
1656  out1 = __msa_copy_u_h(tmp5, 0);
1657  out2 = __msa_copy_u_w(tmp6, 1);
1658  out3 = __msa_copy_u_h(tmp5, 1);
1659 
1660  src += img_width;
1661  SW(out0, src);
1662  SH(out1, (src + 4));
1663  src += img_width;
1664  SW(out2, src);
1665  SH(out3, (src + 4));
1666 
1667  out0 = __msa_copy_u_w(tmp6, 2);
1668  out1 = __msa_copy_u_h(tmp5, 2);
1669  out2 = __msa_copy_u_w(tmp6, 3);
1670  out3 = __msa_copy_u_h(tmp5, 3);
1671 
1672  src += img_width;
1673  SW(out0, src);
1674  SH(out1, (src + 4));
1675  src += img_width;
1676  SW(out2, src);
1677  SH(out3, (src + 4));
1678 
1679  out0 = __msa_copy_u_w(tmp7, 0);
1680  out1 = __msa_copy_u_h(tmp5, 4);
1681  out2 = __msa_copy_u_w(tmp7, 1);
1682  out3 = __msa_copy_u_h(tmp5, 5);
1683 
1684  src += img_width;
1685  SW(out0, src);
1686  SH(out1, (src + 4));
1687  src += img_width;
1688  SW(out2, src);
1689  SH(out3, (src + 4));
1690 
1691  out0 = __msa_copy_u_w(tmp7, 2);
1692  out1 = __msa_copy_u_h(tmp5, 6);
1693  out2 = __msa_copy_u_w(tmp7, 3);
1694  out3 = __msa_copy_u_h(tmp5, 7);
1695 
1696  src += img_width;
1697  SW(out0, src);
1698  SH(out1, (src + 4));
1699  src += img_width;
1700  SW(out2, src);
1701  SH(out3, (src + 4));
1702  }
1703  }
1704 }
1705 
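/* Added commentary (not part of the original source): in outline,
 * avc_loopfilter_luma_inter_edge_hor_msa() applies the H.264 "normal"
 * (bS < 4) luma deblocking filter across a horizontal edge.  The four
 * per-group boundary strengths bs0..bs3 and clipping values tc0..tc3 are
 * broadcast into 16-lane vectors; if every bS is zero the edge is left
 * untouched.  Otherwise the rows p2..q2 around the edge are loaded, the
 * mask |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta && bS > 0 is
 * built, p1/q1 are conditionally filtered (and tc incremented) in the
 * lanes where |p2-p0| < beta resp. |q2-q0| < beta, and finally p0/q0 are
 * updated with the tc-clipped delta and stored back. */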
1706 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1707  uint8_t bs0, uint8_t bs1,
1708  uint8_t bs2, uint8_t bs3,
1709  uint8_t tc0, uint8_t tc1,
1710  uint8_t tc2, uint8_t tc3,
1711  uint8_t alpha_in,
1712  uint8_t beta_in,
1713  uint32_t image_width)
1714 {
1715  v16u8 p2_asub_p0, u8_q2asub_q0;
1716  v16u8 alpha, beta, is_less_than, is_less_than_beta;
1717  v16u8 p1, p0, q0, q1;
1718  v8i16 p1_r = { 0 };
1719  v8i16 p0_r, q0_r, q1_r = { 0 };
1720  v8i16 p1_l = { 0 };
1721  v8i16 p0_l, q0_l, q1_l = { 0 };
1722  v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1723  v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
1724  v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
1725  v16i8 zero = { 0 };
1726  v16u8 tmp_vec;
1727  v16u8 bs = { 0 };
1728  v16i8 tc = { 0 };
1729 
1730  tmp_vec = (v16u8) __msa_fill_b(bs0);
1731  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1732  tmp_vec = (v16u8) __msa_fill_b(bs1);
1733  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1734  tmp_vec = (v16u8) __msa_fill_b(bs2);
1735  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1736  tmp_vec = (v16u8) __msa_fill_b(bs3);
1737  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1738 
1739  if (!__msa_test_bz_v(bs)) {
1740  tmp_vec = (v16u8) __msa_fill_b(tc0);
1741  tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1742  tmp_vec = (v16u8) __msa_fill_b(tc1);
1743  tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1744  tmp_vec = (v16u8) __msa_fill_b(tc2);
1745  tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1746  tmp_vec = (v16u8) __msa_fill_b(tc3);
1747  tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1748 
1749  alpha = (v16u8) __msa_fill_b(alpha_in);
1750  beta = (v16u8) __msa_fill_b(beta_in);
1751 
1752  LD_UB5(data - (3 * image_width), image_width,
1753  p2_org, p1_org, p0_org, q0_org, q1_org);
1754 
1755  {
1756  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1757  v16u8 is_less_than_alpha, is_bs_greater_than0;
1758 
1759  is_bs_greater_than0 = ((v16u8) zero < bs);
1760  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1761  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1762  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1763 
1764  is_less_than_alpha = (p0_asub_q0 < alpha);
1765  is_less_than_beta = (p1_asub_p0 < beta);
1766  is_less_than = is_less_than_beta & is_less_than_alpha;
1767  is_less_than_beta = (q1_asub_q0 < beta);
1768  is_less_than = is_less_than_beta & is_less_than;
1769  is_less_than = is_less_than & is_bs_greater_than0;
1770  }
1771 
1772  if (!__msa_test_bz_v(is_less_than)) {
1773  v16i8 sign_negate_tc, negate_tc;
1774  v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1775 
1776  q2_org = LD_UB(data + (2 * image_width));
1777  negate_tc = zero - tc;
1778  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1779 
1780  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1781 
1782  UNPCK_UB_SH(tc, tc_r, tc_l);
1783  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1784  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1785  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1786 
1787  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1788  is_less_than_beta = (p2_asub_p0 < beta);
1789  is_less_than_beta = is_less_than_beta & is_less_than;
1790  {
1791  v8u16 is_less_than_beta_r, is_less_than_beta_l;
1792 
1793  is_less_than_beta_r =
1794  (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
1795  if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
1796  p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1797 
1798  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1799  negate_tc_r, tc_r, p1_r);
1800  }
1801 
1802  is_less_than_beta_l =
1803  (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
1804  if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
1805  p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1806 
1807  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1808  i16_negatetc_l, tc_l, p1_l);
1809  }
1810  }
1811  if (!__msa_test_bz_v(is_less_than_beta)) {
1812  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1813  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1814  ST_UB(p1_org, data - (2 * image_width));
1815 
1816  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1817  tc = tc + (v16i8) is_less_than_beta;
1818  }
1819 
1820  u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1821  is_less_than_beta = (u8_q2asub_q0 < beta);
1822  is_less_than_beta = is_less_than_beta & is_less_than;
1823 
1824  {
1825  v8u16 is_less_than_beta_r, is_less_than_beta_l;
1826  is_less_than_beta_r =
1827  (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
1828 
1829  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1830  if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
1831  q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1832 
1833  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1834  negate_tc_r, tc_r, q1_r);
1835  }
1836  is_less_than_beta_l =
1837  (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
1838 
1839  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1840  if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
1841  q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1842 
1843  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1844  i16_negatetc_l, tc_l, q1_l);
1845  }
1846  }
1847  if (!__msa_test_bz_v(is_less_than_beta)) {
1848  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1849  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1850  ST_UB(q1_org, data + image_width);
1851 
1852  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1853  tc = tc + (v16i8) is_less_than_beta;
1854  }
1855  {
1856  v16i8 negate_thresh, sign_negate_thresh;
1857  v8i16 threshold_r, threshold_l;
1858  v8i16 negate_thresh_l, negate_thresh_r;
1859 
1860  negate_thresh = zero - tc;
1861  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1862 
1863  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1864  threshold_r, negate_thresh_r);
1865  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1866  negate_thresh_r, threshold_r, p0_r, q0_r);
1867 
1868  threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1869  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1870  negate_thresh);
1871  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1872  negate_thresh_l, threshold_l, p0_l, q0_l);
1873  }
1874 
1875  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1876 
1877  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1878  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1879 
1880  ST_UB(p0_org, (data - image_width));
1881  ST_UB(q0_org, data);
1882  }
1883  }
1884 }
1885 
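/* Added commentary (not part of the original source): a rough sketch of
 * avc_h_loop_filter_luma_mbaff_msa().  It filters a vertical luma edge for
 * eight rows, handled as four pairs of rows that are skipped individually
 * when the corresponding tc0[i] is negative.  Eight bytes per row are
 * loaded from in - 3 and the six pixels p2..q2 are transposed with
 * interleave/shift operations so that each of p2..q2 becomes one vector.
 * The usual alpha/beta masks are formed, p1/q1 are nudged towards
 * (p2 + ((p0 + q0 + 1) >> 1)) >> 1 by at most tc where |p2-p0| < beta
 * (resp. |q2-q0| < beta), tc is widened accordingly, and the clipped
 * p0/q0 delta is applied.  The filtered p1 p0 q0 q1 quad is written back
 * four bytes at a time at in - 2, again honouring the tc0[i] < 0 skips. */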
1886 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
1887  int32_t alpha_in, int32_t beta_in,
1888  int8_t *tc0)
1889 {
1890  uint8_t *data = in;
1891  uint32_t out0, out1, out2, out3;
1892  uint64_t load;
1893  uint32_t tc_val;
1894  v16u8 alpha, beta;
1895  v16i8 inp0 = { 0 };
1896  v16i8 inp1 = { 0 };
1897  v16i8 inp2 = { 0 };
1898  v16i8 inp3 = { 0 };
1899  v16i8 inp4 = { 0 };
1900  v16i8 inp5 = { 0 };
1901  v16i8 inp6 = { 0 };
1902  v16i8 inp7 = { 0 };
1903  v16i8 src0, src1, src2, src3;
1904  v8i16 src4, src5, src6, src7;
1905  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1906  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1907  v16u8 is_less_than_beta1, is_less_than_beta2;
1908  v8i16 tc, tc_orig_r, tc_plus1;
1909  v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1910  v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1911  v8u16 src2_r, src3_r;
1912  v8i16 p2_r, p1_r, q2_r, q1_r;
1913  v16u8 p2, q2, p0, q0;
1914  v4i32 dst0, dst1;
1915  v16i8 zeros = { 0 };
1916 
1917  alpha = (v16u8) __msa_fill_b(alpha_in);
1918  beta = (v16u8) __msa_fill_b(beta_in);
1919 
1920  if (tc0[0] < 0) {
1921  data += (2 * stride);
1922  } else {
1923  load = LD(data - 3);
1924  inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1925  load = LD(data - 3 + stride);
1926  inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1927  data += (2 * stride);
1928  }
1929 
1930  if (tc0[1] < 0) {
1931  data += (2 * stride);
1932  } else {
1933  load = LD(data - 3);
1934  inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1935  load = LD(data - 3 + stride);
1936  inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1937  data += (2 * stride);
1938  }
1939 
1940  if (tc0[2] < 0) {
1941  data += (2 * stride);
1942  } else {
1943  load = LD(data - 3);
1944  inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1945  load = LD(data - 3 + stride);
1946  inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1947  data += (2 * stride);
1948  }
1949 
1950  if (tc0[3] < 0) {
1951  data += (2 * stride);
1952  } else {
1953  load = LD(data - 3);
1954  inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1955  load = LD(data - 3 + stride);
1956  inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1957  data += (2 * stride);
1958  }
1959 
1960  ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1961  src0, src1, src2, src3);
1962 
1963  ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1964  ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1965 
1966  src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1967  src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1968  src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1969  src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1970  src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1971  src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1972 
1973  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1974  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1975  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1976  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1977  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1978 
1979  is_less_than_alpha = (p0_asub_q0 < alpha);
1980  is_less_than_beta = (p1_asub_p0 < beta);
1981  is_less_than = is_less_than_alpha & is_less_than_beta;
1982  is_less_than_beta = (q1_asub_q0 < beta);
1983  is_less_than = is_less_than_beta & is_less_than;
1984 
1985  is_less_than_beta1 = (p2_asub_p0 < beta);
1986  is_less_than_beta2 = (q2_asub_q0 < beta);
1987 
1988  p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1989  p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1990  p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1991 
1992  ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1993  p2_r += p0_add_q0;
1994  p2_r >>= 1;
1995  p2_r -= p1_r;
1996  ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1997  q2_r += p0_add_q0;
1998  q2_r >>= 1;
1999  q2_r -= q1_r;
2000 
2001  tc_val = LW(tc0);
2002  tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
2003  tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
2004  is_tc_orig1 = tc_orig;
2005  is_tc_orig2 = tc_orig;
2006  tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
2007  tc = tc_orig_r;
2008 
2009  p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
2010  q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
2011 
2012  p2_r += p1_r;
2013  q2_r += q1_r;
2014 
2015  PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
2016 
2017  is_tc_orig1 = (zeros < is_tc_orig1);
2018  is_tc_orig2 = is_tc_orig1;
2019  is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
2020  is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
2021  is_tc_orig1 = is_less_than & is_tc_orig1;
2022  is_tc_orig2 = is_less_than & is_tc_orig2;
2023 
2024  p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
2025  q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
2026 
2027  q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
2028  q0_sub_p0 <<= 2;
2029  p1_sub_q1 = p1_r - q1_r;
2030  q0_sub_p0 += p1_sub_q1;
2031  q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
2032 
2033  tc_plus1 = tc + 1;
2034  is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
2035  (v16i8) is_less_than_beta1);
2036  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
2037  tc_plus1 = tc + 1;
2038  is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
2039  (v16i8) is_less_than_beta2);
2040  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
2041 
2042  q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
2043 
2044  ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
2045  src2_r += q0_sub_p0;
2046  src3_r -= q0_sub_p0;
2047 
2048  src2_r = (v8u16) CLIP_SH_0_255(src2_r);
2049  src3_r = (v8u16) CLIP_SH_0_255(src3_r);
2050 
2051  PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
2052 
2053  p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
2054  q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
2055 
2056  ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
2057 
2058  ILVRL_H2_SW(q2, p2, dst0, dst1);
2059 
2060  data = in;
2061 
2062  out0 = __msa_copy_u_w(dst0, 0);
2063  out1 = __msa_copy_u_w(dst0, 1);
2064  out2 = __msa_copy_u_w(dst0, 2);
2065  out3 = __msa_copy_u_w(dst0, 3);
2066 
2067  if (tc0[0] < 0) {
2068  data += (2 * stride);
2069  } else {
2070  SW(out0, (data - 2));
2071  data += stride;
2072  SW(out1, (data - 2));
2073  data += stride;
2074  }
2075 
2076  if (tc0[1] < 0) {
2077  data += (2 * stride);
2078  } else {
2079  SW(out2, (data - 2));
2080  data += stride;
2081  SW(out3, (data - 2));
2082  data += stride;
2083  }
2084 
2085  out0 = __msa_copy_u_w(dst1, 0);
2086  out1 = __msa_copy_u_w(dst1, 1);
2087  out2 = __msa_copy_u_w(dst1, 2);
2088  out3 = __msa_copy_u_w(dst1, 3);
2089 
2090  if (tc0[2] < 0) {
2091  data += (2 * stride);
2092  } else {
2093  SW(out0, (data - 2));
2094  data += stride;
2095  SW(out1, (data - 2));
2096  data += stride;
2097  }
2098 
2099  if (tc0[3] >= 0) {
2100  SW(out2, (data - 2));
2101  data += stride;
2102  SW(out3, (data - 2));
2103  }
2104 }
2105 
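/* Added commentary (not part of the original source):
 * avc_loopfilter_cb_or_cr_inter_edge_hor_msa() is the chroma counterpart of
 * the horizontal inter-edge filter.  Each bS/tc0 value covers two chroma
 * samples, so they are broadcast into halfword slots of the bs/tc vectors.
 * The p1/p0/q0/q1 rows are loaded, the alpha/beta/bS mask is formed (only
 * the low eight lanes matter, hence the ilvr_d with zero), and for chroma
 * only p0 and q0 are corrected via AVC_LPF_P0Q0 before being stored back. */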
2106 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
2107  uint8_t bs0, uint8_t bs1,
2108  uint8_t bs2, uint8_t bs3,
2109  uint8_t tc0, uint8_t tc1,
2110  uint8_t tc2, uint8_t tc3,
2111  uint8_t alpha_in,
2112  uint8_t beta_in,
2113  uint32_t img_width)
2114 {
2115  v16u8 alpha, beta;
2116  v8i16 tmp_vec;
2117  v8i16 bs = { 0 };
2118  v8i16 tc = { 0 };
2119  v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
2120  v16u8 is_less_than;
2121  v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
2122  v8i16 p0_r, q0_r;
2123  v16u8 p1_org, p0_org, q0_org, q1_org;
2124  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2125  v16i8 negate_tc, sign_negate_tc;
2126  v8i16 tc_r, negate_tc_r;
2127  v16i8 zero = { 0 };
2128 
2129  tmp_vec = (v8i16) __msa_fill_b(bs0);
2130  bs = __msa_insve_h(bs, 0, tmp_vec);
2131  tmp_vec = (v8i16) __msa_fill_b(bs1);
2132  bs = __msa_insve_h(bs, 1, tmp_vec);
2133  tmp_vec = (v8i16) __msa_fill_b(bs2);
2134  bs = __msa_insve_h(bs, 2, tmp_vec);
2135  tmp_vec = (v8i16) __msa_fill_b(bs3);
2136  bs = __msa_insve_h(bs, 3, tmp_vec);
2137 
2138  if (!__msa_test_bz_v((v16u8) bs)) {
2139  tmp_vec = (v8i16) __msa_fill_b(tc0);
2140  tc = __msa_insve_h(tc, 0, tmp_vec);
2141  tmp_vec = (v8i16) __msa_fill_b(tc1);
2142  tc = __msa_insve_h(tc, 1, tmp_vec);
2143  tmp_vec = (v8i16) __msa_fill_b(tc2);
2144  tc = __msa_insve_h(tc, 2, tmp_vec);
2145  tmp_vec = (v8i16) __msa_fill_b(tc3);
2146  tc = __msa_insve_h(tc, 3, tmp_vec);
2147 
2148  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2149 
2150  alpha = (v16u8) __msa_fill_b(alpha_in);
2151  beta = (v16u8) __msa_fill_b(beta_in);
2152 
2153  LD_UB4(data - (img_width << 1), img_width,
2154  p1_org, p0_org, q0_org, q1_org);
2155 
2156  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2157  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2158  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2159 
2160  is_less_than_alpha = (p0_asub_q0 < alpha);
2161  is_less_than_beta = (p1_asub_p0 < beta);
2162  is_less_than = is_less_than_beta & is_less_than_alpha;
2163  is_less_than_beta = (q1_asub_q0 < beta);
2164  is_less_than = is_less_than_beta & is_less_than;
2165  is_less_than = is_less_than & is_bs_greater_than0;
2166 
2167  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2168 
2169  if (!__msa_test_bz_v(is_less_than)) {
2170  negate_tc = zero - (v16i8) tc;
2171  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2172 
2173  ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2174 
2175  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2176  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2177 
2178  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2179  tc_r, p0_r, q0_r);
2180 
2181  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2182 
2183  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2184  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2185 
2186  ST_UB(q0_org, data);
2187  ST_UB(p0_org, (data - img_width));
2188  }
2189  }
2190 }
2191 
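/* Added commentary (not part of the original source):
 * avc_loopfilter_cb_or_cr_inter_edge_ver_msa() filters a vertical chroma
 * edge: eight rows of four bytes are loaded from data - 2, transposed with
 * TRANSPOSE8x4_UB_UB into p1/p0/q0/q1 vectors, the same alpha/beta/bS mask
 * as in the horizontal case is applied, p0/q0 are corrected with
 * AVC_LPF_P0Q0, and the interleaved p0/q0 pairs are written back two bytes
 * per row at data - 1 using ST2x4_UB. */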
2192 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
2193  uint8_t bs0, uint8_t bs1,
2194  uint8_t bs2, uint8_t bs3,
2195  uint8_t tc0, uint8_t tc1,
2196  uint8_t tc2, uint8_t tc3,
2197  uint8_t alpha_in,
2198  uint8_t beta_in,
2199  uint32_t img_width)
2200 {
2201  uint8_t *src;
2202  v16u8 alpha, beta;
2203  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2204  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2205  v16u8 p0, q0;
2206  v8i16 p0_r = { 0 };
2207  v8i16 q0_r = { 0 };
2208  v16u8 p1_org, p0_org, q0_org, q1_org;
2209  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2210  v16u8 is_bs_greater_than0;
2211  v8i16 tc_r, negate_tc_r;
2212  v16i8 negate_tc, sign_negate_tc;
2213  v16i8 zero = { 0 };
2214  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2215  v8i16 tmp1, tmp_vec, bs = { 0 };
2216  v8i16 tc = { 0 };
2217 
2218  tmp_vec = (v8i16) __msa_fill_b(bs0);
2219  bs = __msa_insve_h(bs, 0, tmp_vec);
2220  tmp_vec = (v8i16) __msa_fill_b(bs1);
2221  bs = __msa_insve_h(bs, 1, tmp_vec);
2222  tmp_vec = (v8i16) __msa_fill_b(bs2);
2223  bs = __msa_insve_h(bs, 2, tmp_vec);
2224  tmp_vec = (v8i16) __msa_fill_b(bs3);
2225  bs = __msa_insve_h(bs, 3, tmp_vec);
2226 
2227  if (!__msa_test_bz_v((v16u8) bs)) {
2228  tmp_vec = (v8i16) __msa_fill_b(tc0);
2229  tc = __msa_insve_h(tc, 0, tmp_vec);
2230  tmp_vec = (v8i16) __msa_fill_b(tc1);
2231  tc = __msa_insve_h(tc, 1, tmp_vec);
2232  tmp_vec = (v8i16) __msa_fill_b(tc2);
2233  tc = __msa_insve_h(tc, 2, tmp_vec);
2234  tmp_vec = (v8i16) __msa_fill_b(tc3);
2235  tc = __msa_insve_h(tc, 3, tmp_vec);
2236 
2237  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2238 
2239  LD_UB8((data - 2), img_width,
2240  row0, row1, row2, row3, row4, row5, row6, row7);
2241 
2242  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2243  row4, row5, row6, row7,
2244  p1_org, p0_org, q0_org, q1_org);
2245 
2246  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2247  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2248  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2249 
2250  alpha = (v16u8) __msa_fill_b(alpha_in);
2251  beta = (v16u8) __msa_fill_b(beta_in);
2252 
2253  is_less_than_alpha = (p0_asub_q0 < alpha);
2254  is_less_than_beta = (p1_asub_p0 < beta);
2255  is_less_than = is_less_than_beta & is_less_than_alpha;
2256  is_less_than_beta = (q1_asub_q0 < beta);
2257  is_less_than = is_less_than_beta & is_less_than;
2258  is_less_than = is_bs_greater_than0 & is_less_than;
2259 
2260  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2261 
2262  if (!__msa_test_bz_v(is_less_than)) {
2263  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2264  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2265 
2266  negate_tc = zero - (v16i8) tc;
2267  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2268 
2269  ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2270 
2271  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2272  tc_r, p0_r, q0_r);
2273 
2274  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2275 
2276  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2277  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2278  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2279  src = data - 1;
2280  ST2x4_UB(tmp1, 0, src, img_width);
2281  src += 4 * img_width;
2282  ST2x4_UB(tmp1, 4, src, img_width);
2283  }
2284  }
2285 }
2286 
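/* Added commentary (not part of the original source):
 * avc_h_loop_filter_chroma422_msa() handles a vertical chroma edge for
 * 4:2:2 content, four rows per tc0 entry.  Note that (tc0[col] - 1) + 1
 * simply evaluates to tc0[col]; a non-positive value skips the
 * corresponding four rows.  AVC_LPF_H_CHROMA_422 does the load, transpose
 * and filter work, and the p0/q0 pair is stored back at src - 1. */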
2287 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
2288  int32_t alpha_in, int32_t beta_in,
2289  int8_t *tc0)
2290 {
2291  int32_t col, tc_val;
2292  v16u8 alpha, beta, res;
2293 
2294  alpha = (v16u8) __msa_fill_b(alpha_in);
2295  beta = (v16u8) __msa_fill_b(beta_in);
2296 
2297  for (col = 0; col < 4; col++) {
2298  tc_val = (tc0[col] - 1) + 1;
2299 
2300  if (tc_val <= 0) {
2301  src += (4 * stride);
2302  continue;
2303  }
2304 
2305  AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2306  ST2x4_UB(res, 0, (src - 1), stride);
2307  src += (4 * stride);
2308  }
2309 }
2310 
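/* Added commentary (not part of the original source):
 * avc_h_loop_filter_chroma422_mbaff_msa() is the MBAFF variant of the
 * 4:2:2 chroma edge filter above: it iterates over the same four tc0
 * entries, but uses AVC_LPF_H_2BYTE_CHROMA_422 and writes the filtered
 * p0/q0 pair back as a 16-bit store per row at src - 1. */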
2311 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
2312  int32_t alpha_in,
2313  int32_t beta_in,
2314  int8_t *tc0)
2315 {
2316  int32_t col, tc_val;
2317  int16_t out0, out1;
2318  v16u8 alpha, beta, res;
2319 
2320  alpha = (v16u8) __msa_fill_b(alpha_in);
2321  beta = (v16u8) __msa_fill_b(beta_in);
2322 
2323  for (col = 0; col < 4; col++) {
2324  tc_val = (tc0[col] - 1) + 1;
2325 
2326  if (tc_val <= 0) {
2327  src += 4 * stride;
2328  continue;
2329  }
2330 
2331  AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2332 
2333  out0 = __msa_copy_s_h((v8i16) res, 0);
2334  out1 = __msa_copy_s_h((v8i16) res, 1);
2335 
2336  SH(out0, (src - 1));
2337  src += stride;
2338  SH(out1, (src - 1));
2339  src += stride;
2340  }
2341 }
2342 
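/* Added commentary (not part of the original source): the ff_h264_*_inter
 * wrappers below adapt FFmpeg's h264dsp interface to the MSA edge filters.
 * A 4-pixel group is only filtered when its tc0 entry is non-negative, so
 * each wrapper turns tc[i] < 0 into a zero boundary strength before
 * forwarding alpha, beta, tc and the stride. */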
2343 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
2344  int alpha, int beta, int8_t *tc)
2345 {
2346  uint8_t bs0 = 1;
2347  uint8_t bs1 = 1;
2348  uint8_t bs2 = 1;
2349  uint8_t bs3 = 1;
2350 
2351  if (tc[0] < 0)
2352  bs0 = 0;
2353  if (tc[1] < 0)
2354  bs1 = 0;
2355  if (tc[2] < 0)
2356  bs2 = 0;
2357  if (tc[3] < 0)
2358  bs3 = 0;
2359 
2360  avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2361  tc[0], tc[1], tc[2], tc[3],
2362  alpha, beta, img_width);
2363 }
2364 
2365 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
2366  int alpha, int beta, int8_t *tc)
2367 {
2368 
2369  uint8_t bs0 = 1;
2370  uint8_t bs1 = 1;
2371  uint8_t bs2 = 1;
2372  uint8_t bs3 = 1;
2373 
2374  if (tc[0] < 0)
2375  bs0 = 0;
2376  if (tc[1] < 0)
2377  bs1 = 0;
2378  if (tc[2] < 0)
2379  bs2 = 0;
2380  if (tc[3] < 0)
2381  bs3 = 0;
2382 
2383  avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2384  tc[0], tc[1], tc[2], tc[3],
2385  alpha, beta, img_width);
2386 }
2387 
2388 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
2389  int alpha, int beta, int8_t *tc)
2390 {
2391  uint8_t bs0 = 1;
2392  uint8_t bs1 = 1;
2393  uint8_t bs2 = 1;
2394  uint8_t bs3 = 1;
2395 
2396  if (tc[0] < 0)
2397  bs0 = 0;
2398  if (tc[1] < 0)
2399  bs1 = 0;
2400  if (tc[2] < 0)
2401  bs2 = 0;
2402  if (tc[3] < 0)
2403  bs3 = 0;
2404 
2405  avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2406  tc[0], tc[1], tc[2], tc[3],
2407  alpha, beta, img_width);
2408 }
2409 
2410 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
2411  int alpha, int beta, int8_t *tc)
2412 {
2413  uint8_t bs0 = 1;
2414  uint8_t bs1 = 1;
2415  uint8_t bs2 = 1;
2416  uint8_t bs3 = 1;
2417 
2418  if (tc[0] < 0)
2419  bs0 = 0;
2420  if (tc[1] < 0)
2421  bs1 = 0;
2422  if (tc[2] < 0)
2423  bs2 = 0;
2424  if (tc[3] < 0)
2425  bs3 = 0;
2426 
2427  avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2428  tc[0], tc[1], tc[2], tc[3],
2429  alpha, beta, img_width);
2430 }
2431 
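/* Added commentary (not part of the original source): the intra (bS == 4)
 * wrappers below simply cast their arguments and forward to the
 * corresponding MSA intra-edge filters; following the h264dsp convention,
 * the "h" variants filter vertical edges and the "v" variants filter
 * horizontal edges. */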
2432 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
2433  int alpha, int beta)
2434 {
2435  avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2436  (uint8_t) beta,
2437  (unsigned int) img_width);
2438 }
2439 
2440 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
2441  int alpha, int beta)
2442 {
2443  avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2444  (uint8_t) beta,
2445  (unsigned int) img_width);
2446 }
2447 
2448 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
2449  int alpha, int beta)
2450 {
2451  avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2452  (uint8_t) beta,
2453  (unsigned int) img_width);
2454 }
2455 
2456 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
2457  int alpha, int beta)
2458 {
2459  avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2460  (uint8_t) beta,
2461  (unsigned int) img_width);
2462 }
2463 
2464 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2465  int32_t ystride,
2466  int32_t alpha, int32_t beta,
2467  int8_t *tc0)
2468 {
2469  avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2470 }
2471 
2472 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2473  int32_t ystride,
2474  int32_t alpha,
2475  int32_t beta,
2476  int8_t *tc0)
2477 {
2478  avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2479 }
2480 
2481 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2482  int32_t ystride,
2483  int32_t alpha,
2484  int32_t beta,
2485  int8_t *tc0)
2486 {
2487  avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2488 }
2489 
2490 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2491  int32_t ystride,
2492  int32_t alpha,
2493  int32_t beta)
2494 {
2495  avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2496 }
2497 
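/* Added commentary (not part of the original source): the weighted and
 * bi-weighted prediction entry points below forward to the MSA routines
 * for 16-, 8- and 4-pixel wide blocks; the bi-weighted versions pass the
 * same stride for both source and destination. */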
2498 void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride,
2499  int height, int log2_denom,
2500  int weight_src, int offset)
2501 {
2502  avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset);
2503 }
2504 
2505 void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride,
2506  int height, int log2_denom,
2507  int weight_src, int offset)
2508 {
2509  avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
2510 }
2511 
2512 void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride,
2513  int height, int log2_denom,
2514  int weight_src, int offset)
2515 {
2516  avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset);
2517 }
2518 
2519 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2520  int stride, int height,
2521  int log2_denom, int weight_dst,
2522  int weight_src, int offset)
2523 {
2524  avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
2525  weight_src, weight_dst, offset);
2526 }
2527 
2528 void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2529  int stride, int height,
2530  int log2_denom, int weight_dst,
2531  int weight_src, int offset)
2532 {
2533  avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
2534  weight_src, weight_dst, offset);
2535 }
2536 
2537 void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2538  int stride, int height,
2539  int log2_denom, int weight_dst,
2540  int weight_src, int offset)
2541 {
2542  avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
2543  weight_src, weight_dst, offset);
2544 }