hevc_idct_msa.c
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 
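/*
 * Inverse transform factor tables.  The entries are the usual HEVC DCT
 * factors (64/83/36, 89/75/50/18, 90/87/80/...), packed as pairs of 16-bit
 * values per 32-bit word so that SPLATI_W + DOTP_SH can multiply-accumulate
 * two interleaved input rows at once.  gt8x8_cnst drives the 8-point
 * transform and gt16x16_cnst the 16-point one; for the 32-point transform,
 * gt32x32_cnst0 covers the odd rows (1, 3, ..., 31), gt32x32_cnst1 rows
 * 2, 6, ..., 30, gt32x32_cnst2 rows 4, 12, 20, 28 and gt32x32_cnst3 rows
 * 0, 8, 16, 24.
 */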
24 static const int16_t gt8x8_cnst[16] = {
25  64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
26 };
27 
28 static const int16_t gt16x16_cnst[64] = {
29  64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
30  64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
31  64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
32  64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
33 };
34 
35 static const int16_t gt32x32_cnst0[256] = {
36  90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
37  90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
38  88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
39  85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
40  82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
41  78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
42  73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
43  67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
44  61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
45  54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
46  46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
47  38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
48  31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
49  22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
50  13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
51  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
52 };
53 
54 static const int16_t gt32x32_cnst1[64] = {
55  90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
56  80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
57  57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
58  25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
59 };
60 
61 static const int16_t gt32x32_cnst2[16] = {
62  89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
63 };
64 
65 static const int16_t gt32x32_cnst3[16] = {
66  64, 64, 64, 64, 83, 36, -36, -83, 64, -64, -64, 64, 36, -83, 83, -36
67 };
68 
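/*
 * One pass of the 4-point inverse DCT over four 32-bit lanes.  The caller
 * supplies rows 0..3 of the block (in_r0, in_l0, in_r1, in_l1) with one
 * coefficient per 32-bit lane, zero-padded so the halfword dot products
 * reduce to per-lane multiplies.  Scalar equivalent for each lane, with
 * s0..s3 the column entries:
 *     e0 = 64 * (s0 + s2);        e1 = 64 * (s0 - s2);
 *     o0 = 83 * s1 + 36 * s3;     o1 = 36 * s1 - 83 * s3;
 *     sum0 = e0 + o0;  sum1 = e1 + o1;  sum2 = e1 - o1;  sum3 = e0 - o0;
 * followed by rounding by 'shift' (SRARI) and saturation to 16 bits (SAT).
 */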
69 #define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
70  sum0, sum1, sum2, sum3, shift) \
71 { \
72  v4i32 vec0, vec1, vec2, vec3, vec4, vec5; \
73  v4i32 cnst64 = __msa_ldi_w(64); \
74  v4i32 cnst83 = __msa_ldi_w(83); \
75  v4i32 cnst36 = __msa_ldi_w(36); \
76  \
77  DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64, \
78  cnst83, cnst36, vec0, vec2, vec1, vec3); \
79  DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5); \
80  \
81  sum0 = vec0 + vec2; \
82  sum1 = vec0 - vec2; \
83  sum3 = sum0; \
84  sum2 = sum1; \
85  \
86  vec1 += vec3; \
87  vec4 -= vec5; \
88  \
89  sum0 += vec1; \
90  sum1 += vec4; \
91  sum2 -= vec4; \
92  sum3 -= vec1; \
93  \
94  SRARI_W4_SW(sum0, sum1, sum2, sum3, shift); \
95  SAT_SW4_SW(sum0, sum1, sum2, sum3, 15); \
96 }
97 
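/*
 * One pass of the 8-point inverse transform over eight 16-bit columns.
 * in0..in7 hold the eight input rows and are overwritten with the eight
 * output rows.  The macro reads gt8x8_cnst through a 'filter' pointer that
 * must exist in the calling scope: the first half of the table yields
 * output rows 0/7 and 3/4, the second half rows 1/6 and 2/5, each pair
 * formed as (even part +/- odd part), rounded by 'shift' and saturated to
 * 16 bits.
 */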
98 #define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
99 { \
100  v8i16 src0_r, src1_r, src2_r, src3_r; \
101  v8i16 src0_l, src1_l, src2_l, src3_l; \
102  v8i16 filt0, filter0, filter1, filter2, filter3; \
103  v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
104  v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
105  v4i32 sum0_r, sum1_r, sum2_r, sum3_r; \
106  v4i32 sum0_l, sum1_l, sum2_l, sum3_l; \
107  \
108  ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
109  src0_r, src1_r, src2_r, src3_r); \
110  ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
111  src0_l, src1_l, src2_l, src3_l); \
112  \
113  filt0 = LD_SH(filter); \
114  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
115  DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
116  filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
117  \
118  BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
119  sum1_l, sum1_r); \
120  sum2_r = sum1_r; \
121  sum2_l = sum1_l; \
122  sum3_r = sum0_r; \
123  sum3_l = sum0_l; \
124  \
125  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
126  filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
127  \
128  temp2_r += temp3_r; \
129  temp2_l += temp3_l; \
130  sum0_r += temp2_r; \
131  sum0_l += temp2_l; \
132  sum3_r -= temp2_r; \
133  sum3_l -= temp2_l; \
134  \
135  SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
136  SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
137  PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7); \
138  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
139  filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
140  \
141  temp4_r -= temp5_r; \
142  temp4_l -= temp5_l; \
143  sum1_r += temp4_r; \
144  sum1_l += temp4_l; \
145  sum2_r -= temp4_r; \
146  sum2_l -= temp4_l; \
147  \
148  SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
149  SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
150  PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4); \
151  \
152  filt0 = LD_SH(filter + 8); \
153  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
154  DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
155  filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
156  \
157  BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
158  sum1_l, sum1_r); \
159  sum2_r = sum1_r; \
160  sum2_l = sum1_l; \
161  sum3_r = sum0_r; \
162  sum3_l = sum0_l; \
163  \
164  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
165  filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
166  \
167  temp2_r += temp3_r; \
168  temp2_l += temp3_l; \
169  sum0_r += temp2_r; \
170  sum0_l += temp2_l; \
171  sum3_r -= temp2_r; \
172  sum3_l -= temp2_l; \
173  \
174  SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
175  SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
176  PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6); \
177  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
178  filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
179  \
180  temp4_r -= temp5_r; \
181  temp4_l -= temp5_l; \
182  sum1_r -= temp4_r; \
183  sum1_l -= temp4_l; \
184  sum2_r += temp4_r; \
185  sum2_l += temp4_l; \
186  \
187  SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
188  SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
189  PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5); \
190 }
191 
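/*
 * One pass of the 16-point inverse transform over eight interleaved row
 * pairs (src*_r / src*_l, as prepared in hevc_idct_16x16_msa).  The macro
 * depends on several names from the calling scope: 'filter' (the current
 * column of gt16x16_cnst, advanced by 16 per iteration), 'buf_ptr' (output
 * pointer, advanced by 16 per iteration) and the loop variables 'j' and
 * 'k'.  Each of the four iterations produces four output rows (two
 * symmetric pairs) as sums and differences of the even and odd partial
 * results, rounded by 'shift' and saturated to 16 bits.
 */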
192 #define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
193  src4_r, src5_r, src6_r, src7_r, \
194  src0_l, src1_l, src2_l, src3_l, \
195  src4_l, src5_l, src6_l, src7_l, shift) \
196 { \
197  int16_t *ptr0, *ptr1; \
198  v8i16 filt0, filt1, dst0, dst1; \
199  v8i16 filter0, filter1, filter2, filter3; \
200  v4i32 temp0_r, temp1_r, temp0_l, temp1_l; \
201  v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
202  v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l; \
203  \
204  ptr0 = (buf_ptr + 112); \
205  ptr1 = (buf_ptr + 128); \
206  k = -1; \
207  \
208  for (j = 0; j < 4; j++) \
209  { \
210  LD_SH2(filter, 8, filt0, filt1) \
211  filter += 16; \
212  SPLATI_W2_SH(filt0, 0, filter0, filter1); \
213  SPLATI_W2_SH(filt1, 0, filter2, filter3); \
214  DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l, filter0, filter0, \
215  filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l); \
216  DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l); \
217  DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l, filter1, filter1, \
218  filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
219  DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l); \
220  \
221  sum1_r = sum0_r; \
222  sum1_l = sum0_l; \
223  \
224  SPLATI_W2_SH(filt0, 2, filter0, filter1); \
225  SPLATI_W2_SH(filt1, 2, filter2, filter3); \
226  DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l); \
227  DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l); \
228  DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l); \
229  \
230  sum0_r += temp0_r; \
231  sum0_l += temp0_l; \
232  sum1_r -= temp0_r; \
233  sum1_l -= temp0_l; \
234  \
235  sum3_r = temp1_r - sum3_r; \
236  sum3_l = temp1_l - sum3_l; \
237  \
238  DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l); \
239  DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3, \
240  filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
241  \
242  sum0_r += temp0_r; \
243  sum0_l += temp0_l; \
244  sum1_r -= temp0_r; \
245  sum1_l -= temp0_l; \
246  \
247  BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
248  res1_l, res1_r); \
249  SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
250  SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
251  PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
252  ST_SH(dst0, buf_ptr); \
253  ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16))); \
254  \
255  BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
256  res1_l, res1_r); \
257  SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
258  SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
259  PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
260  ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16))); \
261  ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16))); \
262  \
263  k *= -1; \
264  buf_ptr += 16; \
265  } \
266 }
267 
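/*
 * Butterfly update of the even-part accumulators kept in the 32-bit scratch
 * buffer 'input': reload the vector pair at 'load_idx', store
 * (sum + reloaded) back at 'load_idx' and (sum - reloaded) at 'store_idx'.
 * tmp0_r, tmp0_l, tmp1_r and tmp1_l are taken from the calling scope.
 */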
268 #define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
269 { \
270  LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l); \
271  tmp1_r = sum0_r; \
272  tmp1_l = sum0_l; \
273  sum0_r += tmp0_r; \
274  sum0_l += tmp0_l; \
275  ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4); \
276  tmp1_r -= tmp0_r; \
277  tmp1_l -= tmp0_l; \
278  ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4); \
279 }
280 
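/*
 * One pass of the 4x4 inverse DST-VII used for intra luma residuals.
 * With s0..s3 the four input rows, this matches the scalar form
 *     c0 = s0 + s2;  c1 = s2 + s3;  c2 = s0 - s3;  c3 = 74 * s1;
 *     res0 = 29 * c0 + 55 * c1 + c3;
 *     res1 = 55 * c2 - 29 * c1 + c3;
 *     res2 = 74 * (s0 - s2 + s3);
 *     res3 = 55 * c0 + 29 * c2 - c3;
 * each result rounded by 'shift' and saturated to 16 bits.
 */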
281 #define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
282  res0, res1, res2, res3, shift) \
283 { \
284  v4i32 vec0, vec1, vec2, vec3; \
285  v4i32 cnst74 = __msa_ldi_w(74); \
286  v4i32 cnst55 = __msa_ldi_w(55); \
287  v4i32 cnst29 = __msa_ldi_w(29); \
288  \
289  vec0 = in_r0 + in_r1; \
290  vec2 = in_r0 - in_l1; \
291  res0 = vec0 * cnst29; \
292  res1 = vec2 * cnst55; \
293  res2 = in_r0 - in_r1; \
294  vec1 = in_r1 + in_l1; \
295  res2 += in_l1; \
296  vec3 = in_l0 * cnst74; \
297  res3 = vec0 * cnst55; \
298  \
299  res0 += vec1 * cnst55; \
300  res1 -= vec1 * cnst29; \
301  res2 *= cnst74; \
302  res3 += vec2 * cnst29; \
303  \
304  res0 += vec3; \
305  res1 += vec3; \
306  res3 -= vec3; \
307  \
308  SRARI_W4_SW(res0, res1, res2, res3, shift); \
309  SAT_SW4_SW(res0, res1, res2, res3, 15); \
310 }
311 
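/*
 * Full 4x4 inverse transform, in place: column pass (shift 7), transpose,
 * row pass (shift 12, i.e. the 8-bit depth case), transpose, then pack the
 * 32-bit results back to 16-bit coefficients.
 */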
312 static void hevc_idct_4x4_msa(int16_t *coeffs)
313 {
314  v8i16 in0, in1;
315  v4i32 in_r0, in_l0, in_r1, in_l1;
316  v4i32 sum0, sum1, sum2, sum3;
317  v8i16 zeros = { 0 };
318 
319  LD_SH2(coeffs, 8, in0, in1);
320  ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
321  ILVRL_H2_SW(zeros, in1, in_r1, in_l1);
322 
323  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
324  TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
325  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
326  TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, sum0, sum1, sum2, sum3);
327  PCKEV_H2_SH(sum1, sum0, sum3, sum2, in0, in1);
328  ST_SH2(in0, in1, coeffs, 8);
329 }
330 
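/*
 * Full 8x8 inverse transform, in place: column pass, transpose, row pass,
 * transpose.
 */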
331 static void hevc_idct_8x8_msa(int16_t *coeffs)
332 {
333  const int16_t *filter = &gt8x8_cnst[0];
334  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
335 
336  LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
337  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
338  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
339  in0, in1, in2, in3, in4, in5, in6, in7);
340  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
341  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
342  in0, in1, in2, in3, in4, in5, in6, in7);
343  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
344 }
345 
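/*
 * Full 16x16 inverse transform.  The first loop runs the column pass
 * (shift 7) over the two 8-column halves into the 256-entry scratch buffer;
 * the second loop transposes 8x8 tiles out of that buffer and runs the row
 * pass (shift 12) back into 'coeffs'; the code after the loops transposes
 * the four 8x8 tiles into their final positions.
 */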
346 static void hevc_idct_16x16_msa(int16_t *coeffs)
347 {
348  int16_t i, j, k;
349  int16_t buf[256];
350  int16_t *buf_ptr = &buf[0];
351  int16_t *src = coeffs;
352  const int16_t *filter = &gt16x16_cnst[0];
353  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
354  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
355  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
356  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
357  v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
358 
359  for (i = 2; i--;) {
360  LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
361  in8, in9, in10, in11, in12, in13, in14, in15);
362 
363  ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
364  src0_r, src1_r, src2_r, src3_r);
365  ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
366  src4_r, src5_r, src6_r, src7_r);
367  ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
368  src0_l, src1_l, src2_l, src3_l);
369  ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
370  src4_l, src5_l, src6_l, src7_l);
371  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
372  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
373  src4_l, src5_l, src6_l, src7_l, 7);
374 
375  src += 8;
376  buf_ptr = (&buf[0] + 8);
377  filter = &gt16x16_cnst[0];
378  }
379 
380  src = &buf[0];
381  buf_ptr = coeffs;
382  filter = &gt16x16_cnst[0];
383 
384  for (i = 2; i--;) {
385  LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
386  in4, in12, in5, in13, in6, in14, in7, in15);
387  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
388  in0, in1, in2, in3, in4, in5, in6, in7);
389  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
390  in8, in9, in10, in11, in12, in13, in14, in15);
391  ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
392  src0_r, src1_r, src2_r, src3_r);
393  ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
394  src4_r, src5_r, src6_r, src7_r);
395  ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
396  src0_l, src1_l, src2_l, src3_l);
397  ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
398  src4_l, src5_l, src6_l, src7_l);
399  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
400  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
401  src4_l, src5_l, src6_l, src7_l, 12);
402 
403  src += 128;
404  buf_ptr = coeffs + 8;
405  filter = &gt16x16_cnst[0];
406  }
407 
408  LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
409  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
410  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
411  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);
412 
413  LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
414  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
415  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
416  LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
417  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
418  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
419  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
420  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);
421 
422  LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
423  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
424  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
425  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
426 }
427 
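/*
 * One pass of the 32-point inverse transform over an 8-column slice.
 * 'buf_pitch' is the row stride in coefficients and 'round' the rounding
 * shift (7 for the column pass, 12 for the 8-bit row pass).  The even half
 * of the transform is accumulated in a 32-bit scratch buffer in three
 * stages (rows 4/12/20/28, then 0/8/16/24 folded in, then 2/6/.../30); the
 * final loop adds and subtracts the sums over the odd rows (1/3/.../31),
 * rounds, saturates and stores all 32 output rows back to 'coeffs'.
 */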
428 static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
429  uint8_t round)
430 {
431  uint8_t i;
432  const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
433  const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
434  const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
435  const int16_t *filter_ptr3 = &gt32x32_cnst3[0];
436  int16_t *src0 = (coeffs + buf_pitch);
437  int16_t *src1 = (coeffs + 2 * buf_pitch);
438  int16_t *src2 = (coeffs + 4 * buf_pitch);
439  int16_t *src3 = (coeffs);
440  int32_t cnst0, cnst1;
441  int32_t tmp_buf[8 * 32];
442  int32_t *tmp_buf_ptr = &tmp_buf[0];
443  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
444  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
445  v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
446  v8i16 filt0, filter0, filter1, filter2, filter3;
447  v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
448 
449  /* process coeff 4, 12, 20, 28 */
450  LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
451  ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
452  ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
453 
454  /* loop for all columns of constants */
455  for (i = 0; i < 4; i++) {
456  /* processing single column of constants */
457  cnst0 = LW(filter_ptr2);
458  cnst1 = LW(filter_ptr2 + 2);
459 
460  filter0 = (v8i16) __msa_fill_w(cnst0);
461  filter1 = (v8i16) __msa_fill_w(cnst1);
462 
463  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
464  DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
465  ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + i * 8), 4);
466 
467  filter_ptr2 += 4;
468  }
469 
470  /* process coeff 0, 8, 16, 24 */
471  LD_SH2(src3, 16 * buf_pitch, in0, in2);
472  LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in1, in3);
473 
474  ILVR_H2_SH(in2, in0, in3, in1, src0_r, src1_r);
475  ILVL_H2_SH(in2, in0, in3, in1, src0_l, src1_l);
476 
477  /* loop for all columns of constants */
478  for (i = 0; i < 2; i++) {
479  /* processing first column of filter constants */
480  cnst0 = LW(filter_ptr3);
481  cnst1 = LW(filter_ptr3 + 4);
482 
483  filter0 = (v8i16) __msa_fill_w(cnst0);
484  filter1 = (v8i16) __msa_fill_w(cnst1);
485 
486  DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, filter1,
487  filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
488 
489  sum1_r = sum0_r;
490  sum1_l = sum0_l;
491  sum0_r += tmp1_r;
492  sum0_l += tmp1_l;
493 
494  sum1_r -= tmp1_r;
495  sum1_l -= tmp1_l;
496 
497  HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
498  HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));
499 
500  filter_ptr3 += 8;
501  }
502 
503  /* process coeff 2 6 10 14 18 22 26 30 */
504  LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
505  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
506  src0_r, src1_r, src2_r, src3_r);
507  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
508  src0_l, src1_l, src2_l, src3_l);
509 
510  /* loop for all columns of constants */
511  for (i = 0; i < 8; i++) {
512  /* processing single column of constants */
513  filt0 = LD_SH(filter_ptr1);
514  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
515  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
516  DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
517  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
518  DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
519 
520  LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
521  tmp1_r = tmp0_r;
522  tmp1_l = tmp0_l;
523  tmp0_r += sum0_r;
524  tmp0_l += sum0_l;
525  ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
526  tmp1_r -= sum0_r;
527  tmp1_l -= sum0_l;
528  ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
529 
530  filter_ptr1 += 8;
531  }
532 
533  /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
534  LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
535  src0 += 16 * buf_pitch;
536  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
537  src0_r, src1_r, src2_r, src3_r);
538  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
539  src0_l, src1_l, src2_l, src3_l);
540 
541  LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
542  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
543  src4_r, src5_r, src6_r, src7_r);
544  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
545  src4_l, src5_l, src6_l, src7_l);
546 
547  /* loop for all columns of filter constants */
548  for (i = 0; i < 16; i++) {
549  /* processing single column of constants */
550  filt0 = LD_SH(filter_ptr0);
551  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
552  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
553  DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
554  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
555  DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
556 
557  tmp1_r = sum0_r;
558  tmp1_l = sum0_l;
559 
560  filt0 = LD_SH(filter_ptr0 + 8);
561  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
562  DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
563  DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
564  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
565  DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
566 
567  sum0_r += tmp1_r;
568  sum0_l += tmp1_l;
569 
570  LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
571  tmp1_r = tmp0_r;
572  tmp1_l = tmp0_l;
573  tmp0_r += sum0_r;
574  tmp0_l += sum0_l;
575  sum1_r = __msa_fill_w(round);
576  SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
577  SAT_SW2_SW(tmp0_r, tmp0_l, 15);
578  in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
579  ST_SH(in0, (coeffs + i * buf_pitch));
580  tmp1_r -= sum0_r;
581  tmp1_l -= sum0_l;
582  SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
583  SAT_SW2_SW(tmp1_r, tmp1_l, 15);
584  in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
585  ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
586 
587  filter_ptr0 += 16;
588  }
589 }
590 
591 static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
592 {
593  uint8_t i;
594  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
595 
596  for (i = 0; i < 4; i++) {
597  LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
598  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
599  in0, in1, in2, in3, in4, in5, in6, in7);
600  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
601  }
602 }
603 
604 static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
605 {
606  uint8_t i;
607  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
608 
609  for (i = 0; i < 4; i++) {
610  LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
611  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
612  in0, in1, in2, in3, in4, in5, in6, in7);
613  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
614  }
615 }
616 
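/*
 * Full 32x32 inverse transform.  The column pass runs the 8x32 routine
 * directly on each of the four 8-column slices of 'coeffs'; the row pass
 * transposes each 32x8 slice into a temporary 8x32 block, runs the same
 * routine on it with the final-stage rounding, and transposes the result
 * back.
 */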
617 static void hevc_idct_32x32_msa(int16_t *coeffs)
618 {
619  uint8_t row_cnt, col_cnt;
620  int16_t *src = coeffs;
621  int16_t tmp_buf[8 * 32];
622  int16_t *tmp_buf_ptr = &tmp_buf[0];
623  uint8_t round;
624  uint8_t buf_pitch;
625 
626  /* column transform */
627  round = 7;
628  buf_pitch = 32;
629  for (col_cnt = 0; col_cnt < 4; col_cnt++) {
630  /* process 8x32 blocks */
631  hevc_idct_8x32_column_msa((coeffs + col_cnt * 8), buf_pitch, round);
632  }
633 
634  /* row transform */
635  round = 12;
636  buf_pitch = 8;
637  for (row_cnt = 0; row_cnt < 4; row_cnt++) {
638  /* process 32x8 blocks */
639  src = (coeffs + 32 * 8 * row_cnt);
640 
641  hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
642  hevc_idct_8x32_column_msa(tmp_buf_ptr, buf_pitch, round);
643  hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
644  }
645 }
646 
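/*
 * DC-only short cuts: when only coeffs[0] is non-zero, both transform
 * passes reduce to replicating a single rounded value over the block,
 *     val = (((coeffs[0] + 1) >> 1) + 32) >> 6,
 * i.e. the (x * 64 + 64) >> 7 and (x * 64 + 2048) >> 12 roundings of the
 * two passes collapsed for a constant input.
 */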
647 static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
648 {
649  int32_t val;
650  v8i16 dst;
651 
652  val = (coeffs[0] + 1) >> 1;
653  val = (val + 32) >> 6;
654  dst = __msa_fill_h(val);
655 
656  ST_SH2(dst, dst, coeffs, 8);
657 }
658 
659 static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
660 {
661  int32_t val;
662  v8i16 dst;
663 
664  val = (coeffs[0] + 1) >> 1;
665  val = (val + 32) >> 6;
666  dst = __msa_fill_h(val);
667 
668  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
669 }
670 
671 static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
672 {
673  uint8_t loop;
674  int32_t val;
675  v8i16 dst;
676 
677  val = (coeffs[0] + 1) >> 1;
678  val = (val + 32) >> 6;
679  dst = __msa_fill_h(val);
680 
681  for (loop = 4; loop--;) {
682  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
683  coeffs += 8 * 8;
684  }
685 }
686 
687 static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
688 {
689  uint8_t loop;
690  int32_t val;
691  v8i16 dst;
692 
693  val = (coeffs[0] + 1) >> 1;
694  val = (val + 32) >> 6;
695  dst = __msa_fill_h(val);
696 
697  for (loop = 16; loop--;) {
698  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
699  coeffs += 8 * 8;
700  }
701 }
702 
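/*
 * Residual add-back (8-bit): load the prediction from 'dst', widen it to
 * 16 bits, add the inverse-transformed residual from 'coeffs', clip to
 * [0, 255], pack and store the reconstructed pixels back to 'dst'.
 */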
703 static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
704 {
705  uint32_t dst0, dst1, dst2, dst3;
706  v8i16 dst_r0, dst_l0, in0, in1;
707  v4i32 dst_vec = { 0 };
708  v16u8 zeros = { 0 };
709 
710  LD_SH2(coeffs, 8, in0, in1);
711  LW4(dst, stride, dst0, dst1, dst2, dst3);
712  INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
713  ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
714  ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
715  CLIP_SH2_0_255(dst_r0, dst_l0);
716  dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
717  ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
718 }
719 
720 static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
721 {
722  uint8_t *temp_dst = dst;
723  uint64_t dst0, dst1, dst2, dst3;
724  v2i64 dst_vec0 = { 0 };
725  v2i64 dst_vec1 = { 0 };
726  v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
727  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
728  v16u8 zeros = { 0 };
729 
730  LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
731  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
732  temp_dst += (4 * stride);
733 
734  INSERT_D2_SD(dst0, dst1, dst_vec0);
735  INSERT_D2_SD(dst2, dst3, dst_vec1);
736  ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
737  ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
738  ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
739  dst_r0, dst_l0, dst_r1, dst_l1);
740  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
741  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
742  ST8x4_UB(dst_r0, dst_r1, dst, stride);
743  dst += (4 * stride);
744 
745  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
746  INSERT_D2_SD(dst0, dst1, dst_vec0);
747  INSERT_D2_SD(dst2, dst3, dst_vec1);
748  UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
749  UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
750  ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
751  dst_r0, dst_l0, dst_r1, dst_l1);
752  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
753  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
754  ST8x4_UB(dst_r0, dst_r1, dst, stride);
755 }
756 
757 static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
758 {
759  uint8_t loop_cnt;
760  uint8_t *temp_dst = dst;
761  v16u8 dst0, dst1, dst2, dst3;
762  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
763  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
764 
765  for (loop_cnt = 4; loop_cnt--;) {
766  LD_SH4(coeffs, 16, in0, in2, in4, in6);
767  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
768  coeffs += 64;
769  LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
770  temp_dst += (4 * stride);
771 
772  UNPCK_UB_SH(dst0, dst_r0, dst_l0);
773  UNPCK_UB_SH(dst1, dst_r1, dst_l1);
774  UNPCK_UB_SH(dst2, dst_r2, dst_l2);
775  UNPCK_UB_SH(dst3, dst_r3, dst_l3);
776 
777  dst_r0 += in0;
778  dst_l0 += in1;
779  dst_r1 += in2;
780  dst_l1 += in3;
781  dst_r2 += in4;
782  dst_l2 += in5;
783  dst_r3 += in6;
784  dst_l3 += in7;
785 
786  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
787  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
788  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
789  dst_r3, dst0, dst1, dst2, dst3);
790  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
791  dst += (4 * stride);
792  }
793 }
794 
795 static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
796 {
797  uint8_t loop_cnt;
798  uint8_t *temp_dst = dst;
799  v16u8 dst0, dst1, dst2, dst3;
800  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
801  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
802 
803  for (loop_cnt = 8; loop_cnt--;) {
804  LD_SH4(coeffs, 32, in0, in2, in4, in6);
805  LD_SH4((coeffs + 8), 32, in1, in3, in5, in7);
806  LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
807 
808  UNPCK_UB_SH(dst0, dst_r0, dst_l0);
809  UNPCK_UB_SH(dst1, dst_r1, dst_l1);
810  UNPCK_UB_SH(dst2, dst_r2, dst_l2);
811  UNPCK_UB_SH(dst3, dst_r3, dst_l3);
812 
813  dst_r0 += in0;
814  dst_l0 += in1;
815  dst_r1 += in2;
816  dst_l1 += in3;
817  dst_r2 += in4;
818  dst_l2 += in5;
819  dst_r3 += in6;
820  dst_l3 += in7;
821 
822  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
823  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
824  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
825  dst_r3, dst0, dst1, dst2, dst3);
826  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
827 
828  LD_SH4((coeffs + 16), 32, in0, in2, in4, in6);
829  LD_SH4((coeffs + 24), 32, in1, in3, in5, in7);
830  coeffs += 128;
831  LD_UB4((temp_dst + 16), stride, dst0, dst1, dst2, dst3);
832  temp_dst += (4 * stride);
833 
834  UNPCK_UB_SH(dst0, dst_r0, dst_l0);
835  UNPCK_UB_SH(dst1, dst_r1, dst_l1);
836  UNPCK_UB_SH(dst2, dst_r2, dst_l2);
837  UNPCK_UB_SH(dst3, dst_r3, dst_l3);
838 
839  dst_r0 += in0;
840  dst_l0 += in1;
841  dst_r1 += in2;
842  dst_l1 += in3;
843  dst_r2 += in4;
844  dst_l2 += in5;
845  dst_r3 += in6;
846  dst_l3 += in7;
847 
848  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
849  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
850  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
851  dst_r3, dst0, dst1, dst2, dst3);
852 
853  ST_UB4(dst0, dst1, dst2, dst3, (dst + 16), stride);
854  dst += (4 * stride);
855  }
856 }
857 
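/*
 * 4x4 DST-VII transform for intra luma residuals: same two-pass structure
 * as hevc_idct_4x4_msa, but using the 29/55/74 DST factors.
 */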
858 static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
859 {
860  v8i16 in0, in1, dst0, dst1;
861  v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;
862 
863  LD_SH2(coeffs, 8, in0, in1);
864  UNPCK_SH_SW(in0, in_r0, in_l0);
865  UNPCK_SH_SW(in1, in_r1, in_l1);
866  HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
867  7);
868  TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
869  HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
870  12);
871  TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, res0, res1, res2, res3);
872  PCKEV_H2_SH(res1, res0, res3, res2, dst0, dst1);
873  ST_SH2(dst0, dst1, coeffs, 8);
874 }
875 
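/*
 * Exported entry points.  'col_limit' is accepted to match the idct
 * callback signature in HEVCDSPContext but is not used: the full block is
 * always transformed.  The MIPS init code is expected to install these
 * along the lines of (sketch only, exact field names assumed):
 *
 *     c->idct[0] = ff_hevc_idct_4x4_msa;
 *     c->idct[1] = ff_hevc_idct_8x8_msa;
 *     c->idct[2] = ff_hevc_idct_16x16_msa;
 *     c->idct[3] = ff_hevc_idct_32x32_msa;
 */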
876 void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
877 {
878  hevc_idct_4x4_msa(coeffs);
879 }
880 
881 void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
882 {
883  hevc_idct_8x8_msa(coeffs);
884 }
885 
886 void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
887 {
888  hevc_idct_16x16_msa(coeffs);
889 }
890 
891 void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
892 {
893  hevc_idct_32x32_msa(coeffs);
894 }
895 
896 void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
897 {
898  hevc_addblk_4x4_msa(coeffs, dst, stride);
899 }
900 
901 void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
902 {
903  hevc_addblk_8x8_msa(coeffs, dst, stride);
904 }
905 
906 void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
907 {
908  hevc_addblk_16x16_msa(coeffs, dst, stride);
909 }
910 
911 void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
912 {
913  hevc_addblk_32x32_msa(coeffs, dst, stride);
914 }
915 
916 void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
917 {
918  hevc_idct_dc_4x4_msa(coeffs);
919 }
920 
921 void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
922 {
923  hevc_idct_dc_8x8_msa(coeffs);
924 }
925 
926 void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
927 {
928  hevc_idct_dc_16x16_msa(coeffs);
929 }
930 
931 void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
932 {
933  hevc_idct_dc_32x32_msa(coeffs);
934 }
935 
936 void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
937 {
938  hevc_idct_luma_4x4_msa(coeffs);
939 }