30 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
31 v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7;
32 v4i32 in_l0, in_l1, in_l2, in_l3, in_l4, in_l5, in_l6, in_l7;
33 v4i32 t_r1, t_r2, t_r3, t_r4, t_r5, t_r6, t_r7, t_r8;
34 v4i32 t_l1, t_l2, t_l3, t_l4, t_l5, t_l6, t_l7, t_l8;
35 v4i32 cnst_12 = {12, 12, 12, 12};
36 v4i32 cnst_4 = {4, 4, 4, 4};
37 v4i32 cnst_16 = {16, 16, 16, 16};
38 v4i32 cnst_6 = {6, 6, 6, 6};
39 v4i32 cnst_15 = {15, 15, 15, 15};
40 v4i32 cnst_9 = {9, 9, 9, 9};
41 v4i32 cnst_1 = {1, 1, 1, 1};
42 v4i32 cnst_64 = {64, 64, 64, 64};
44 LD_SH8(
block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
54 t_r1 = cnst_12 * (in_r0 + in_r4) + cnst_4;
55 t_l1 = cnst_12 * (in_l0 + in_l4) + cnst_4;
56 t_r2 = cnst_12 * (in_r0 - in_r4) + cnst_4;
57 t_l2 = cnst_12 * (in_l0 - in_l4) + cnst_4;
58 t_r3 = cnst_16 * in_r2 + cnst_6 * in_r6;
59 t_l3 = cnst_16 * in_l2 + cnst_6 * in_l6;
60 t_r4 = cnst_6 * in_r2 - cnst_16 * in_r6;
61 t_l4 = cnst_6 * in_l2 - cnst_16 * in_l6;
63 ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6);
64 SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8);
65 t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7;
66 t_l1 = cnst_16 * in_l1 + cnst_15 * in_l3 + cnst_9 * in_l5 + cnst_4 * in_l7;
67 t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7;
68 t_l2 = cnst_15 * in_l1 - cnst_4 * in_l3 - cnst_16 * in_l5 - cnst_9 * in_l7;
69 t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7;
70 t_l3 = cnst_9 * in_l1 - cnst_16 * in_l3 + cnst_4 * in_l5 + cnst_15 * in_l7;
71 t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7;
72 t_l4 = cnst_4 * in_l1 - cnst_9 * in_l3 + cnst_15 * in_l5 - cnst_16 * in_l7;
74 in_r0 = (t_r5 + t_r1) >> 3;
75 in_l0 = (t_l5 + t_l1) >> 3;
76 in_r1 = (t_r6 + t_r2) >> 3;
77 in_l1 = (t_l6 + t_l2) >> 3;
78 in_r2 = (t_r7 + t_r3) >> 3;
79 in_l2 = (t_l7 + t_l3) >> 3;
80 in_r3 = (t_r8 + t_r4) >> 3;
81 in_l3 = (t_l8 + t_l4) >> 3;
83 in_r4 = (t_r8 - t_r4) >> 3;
84 in_l4 = (t_l8 - t_l4) >> 3;
85 in_r5 = (t_r7 - t_r3) >> 3;
86 in_l5 = (t_l7 - t_l3) >> 3;
87 in_r6 = (t_r6 - t_r2) >> 3;
88 in_l6 = (t_l6 - t_l2) >> 3;
89 in_r7 = (t_r5 - t_r1) >> 3;
90 in_l7 = (t_l5 - t_l1) >> 3;
96 t_r1 = cnst_12 * (in_r0 + in_l0) + cnst_64;
97 t_l1 = cnst_12 * (in_r4 + in_l4) + cnst_64;
98 t_r2 = cnst_12 * (in_r0 - in_l0) + cnst_64;
99 t_l2 = cnst_12 * (in_r4 - in_l4) + cnst_64;
100 t_r3 = cnst_16 * in_r2 + cnst_6 * in_l2;
101 t_l3 = cnst_16 * in_r6 + cnst_6 * in_l6;
102 t_r4 = cnst_6 * in_r2 - cnst_16 * in_l2;
103 t_l4 = cnst_6 * in_r6 - cnst_16 * in_l6;
105 ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6);
106 SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8);
107 t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_l1 + cnst_4 * in_l3;
108 t_l1 = cnst_16 * in_r5 + cnst_15 * in_r7 + cnst_9 * in_l5 + cnst_4 * in_l7;
109 t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_l1 - cnst_9 * in_l3;
110 t_l2 = cnst_15 * in_r5 - cnst_4 * in_r7 - cnst_16 * in_l5 - cnst_9 * in_l7;
111 t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_l1 + cnst_15 * in_l3;
112 t_l3 = cnst_9 * in_r5 - cnst_16 * in_r7 + cnst_4 * in_l5 + cnst_15 * in_l7;
113 t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_l1 - cnst_16 * in_l3;
114 t_l4 = cnst_4 * in_r5 - cnst_9 * in_r7 + cnst_15 * in_l5 - cnst_16 * in_l7;
116 in_r0 = (t_r5 + t_r1) >> 7;
117 in_l0 = (t_l5 + t_l1) >> 7;
118 in_r1 = (t_r6 + t_r2) >> 7;
119 in_l1 = (t_l6 + t_l2) >> 7;
120 in_r2 = (t_r7 + t_r3) >> 7;
121 in_l2 = (t_l7 + t_l3) >> 7;
122 in_r3 = (t_r8 + t_r4) >> 7;
123 in_l3 = (t_l8 + t_l4) >> 7;
125 in_r4 = (t_r8 - t_r4 + cnst_1) >> 7;
126 in_l4 = (t_l8 - t_l4 + cnst_1) >> 7;
127 in_r5 = (t_r7 - t_r3 + cnst_1) >> 7;
128 in_l5 = (t_l7 - t_l3 + cnst_1) >> 7;
129 in_r6 = (t_r6 - t_r2 + cnst_1) >> 7;
130 in_l6 = (t_l6 - t_l2 + cnst_1) >> 7;
131 in_r7 = (t_r5 - t_r1 + cnst_1) >> 7;
132 in_l7 = (t_l5 - t_l1 + cnst_1) >> 7;
133 PCKEV_H4_SH(in_l0, in_r0, in_l1, in_r1, in_l2, in_r2, in_l3, in_r3,
135 PCKEV_H4_SH(in_l4, in_r4, in_l5, in_r5, in_l6, in_r6, in_l7, in_r7,
137 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7,
block, 8);
/* Fragment of a routine that transforms the coefficients held in `block` and
 * adds the result to the pixels at `dest`. The function header, the
 * declarations of t1..t8 and several statements are not visible in this
 * excerpt, so the notes below describe only what the visible lines do. */
142 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
143 v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7;
145 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
146 v16i8 zero_m = { 0 };
/* Splatted integer multipliers and rounding biases used by the two passes. */
147 v4i32 cnst_17 = {17, 17, 17, 17};
148 v4i32 cnst_22 = {22, 22, 22, 22};
149 v4i32 cnst_10 = {10, 10, 10, 10};
150 v4i32 cnst_12 = {12, 12, 12, 12};
151 v4i32 cnst_64 = {64, 64, 64, 64};
152 v4i32 cnst_16 = {16, 16, 16, 16};
153 v4i32 cnst_15 = {15, 15, 15, 15};
154 v4i32 cnst_4 = {4, 4, 4, 4};
155 v4i32 cnst_6 = {6, 6, 6, 6};
156 v4i32 cnst_9 = {9, 9, 9, 9};
157 v4i32 cnst_1 = {1, 1, 1, 1};
/* Fetch eight vectors from block; the literal 8 is presumably the element
 * stride between them - confirm against the LD_SH8 definition. */
159 LD_SH8(
block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
/* NOTE(review): the statements that derive in_r0..in_r7 from in0..in7 are
 * not shown in this excerpt. */
/* First pass: a 4-input butterfly applied twice, once to in_r0..in_r3
 * (t1..t4) and once to in_r4..in_r7 (t5..t8). The sum/difference terms are
 * scaled by 17 and carry a bias of 4; the other terms use weights 22 and 10.
 * NOTE(review): these weights look like the VC-1 4-point inverse transform -
 * confirm. */
171 t1 = cnst_17 * (in_r0 + in_r2) + cnst_4;
172 t5 = cnst_17 * (in_r4 + in_r6) + cnst_4;
173 t2 = cnst_17 * (in_r0 - in_r2) + cnst_4;
174 t6 = cnst_17 * (in_r4 - in_r6) + cnst_4;
175 t3 = cnst_22 * in_r1 + cnst_10 * in_r3;
176 t7 = cnst_22 * in_r5 + cnst_10 * in_r7;
177 t4 = cnst_22 * in_r3 - cnst_10 * in_r1;
178 t8 = cnst_22 * in_r7 - cnst_10 * in_r5;
/* Combine the butterfly terms and scale down by 2^3; the bias of 4 added
 * above is half of that divisor, so the shift rounds to nearest. */
180 in_r0 = (
t1 +
t3) >> 3;
181 in_r4 = (
t5 +
t7) >> 3;
182 in_r1 = (
t2 -
t4) >> 3;
183 in_r5 = (
t6 -
t8) >> 3;
184 in_r2 = (
t2 +
t4) >> 3;
185 in_r6 = (
t6 +
t8) >> 3;
186 in_r3 = (
t1 -
t3) >> 3;
187 in_r7 = (
t5 -
t7) >> 3;
/* Pack the first-pass results and write them back to block. The tail of the
 * PCKEV_H4_SH call (its output operands) is not shown in this excerpt. */
190 PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6,
192 ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1,
block, 8);
/* Second pass, first group of terms: 12 * (a +/- b) with a bias of 64, plus
 * a 16/6 weighted pair.
 * NOTE(review): t5..t8, which are read further down, must be produced from
 * these four values by statements that are not shown here - confirm. */
194 t1 = cnst_12 * (in_r0 + in_r4) + cnst_64;
195 t2 = cnst_12 * (in_r0 - in_r4) + cnst_64;
196 t3 = cnst_16 * in_r2 + cnst_6 * in_r6;
197 t4 = cnst_6 * in_r2 - cnst_16 * in_r6;
/* Second pass, second group of terms: t1..t4 are overwritten with weighted
 * sums of in_r1, in_r3, in_r5 and in_r7 (weights 16, 15, 9, 4). */
200 t1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7;
201 t2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7;
202 t3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7;
203 t4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7;
/* Read eight rows from dest (linesize apart) and interleave them with a zero
 * vector twice, first at byte and then at halfword granularity; presumably
 * this zero-extends the low four pixels of each row to 32-bit lanes -
 * confirm against the ILVR_* macro definitions. */
204 LD_SW8(dest, linesize, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
205 ILVR_B8_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
206 zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,
207 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
208 ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
209 dst0, dst1, dst2, dst3);
210 ILVR_H4_SW(zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,
211 dst4, dst5, dst6, dst7);
/* Final combine with a shift by 7: outputs 0..3 are sums t(5..8) + t(1..4). */
212 in_r0 = (
t5 +
t1) >> 7;
213 in_r1 = (
t6 +
t2) >> 7;
214 in_r2 = (
t7 +
t3) >> 7;
215 in_r3 = (
t8 +
t4) >> 7;
/* Outputs 4..7 are the mirrored differences and receive an extra +1 before
 * the shift. */
216 in_r4 = (
t8 -
t4 + cnst_1) >> 7;
217 in_r5 = (
t7 -
t3 + cnst_1) >> 7;
218 in_r6 = (
t6 -
t2 + cnst_1) >> 7;
219 in_r7 = (
t5 -
t1 + cnst_1) >> 7;
/* Add the widened dest rows to the transform output (ADD4 is presumably four
 * pairwise vector additions), then clamp; the macro name suggests the clamp
 * range is 0..255 - confirm. */
220 ADD4(in_r0, dst0, in_r1, dst1, in_r2, dst2, in_r3, dst3,
221 in_r0, in_r1, in_r2, in_r3);
222 ADD4(in_r4, dst4, in_r5, dst5, in_r6, dst6, in_r7, dst7,
223 in_r4, in_r5, in_r6, in_r7);
224 CLIP_SW8_0_255(in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7);
/* Pack the clamped values and store them to dest, linesize apart. The tail
 * of the PCKEV_H4_SH call and any further packing step are not shown. */
225 PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6,
228 ST_W8(in0, in1, 0, 1, 2, 3, 0, 1, 2, 3, dest, linesize);
/* Fragment of a routine that transforms coefficients and adds the result to
 * four rows of pixels at `dest`. The function header, the declarations of
 * t1..t8 and the statements that load in0..in7 are not visible in this
 * excerpt, so the notes below describe only what the visible lines do. */
233 v4i32 in0, in1, in2, in3, in4, in5, in6, in7;
235 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
236 v16i8 zero_m = { 0 };
/* Splatted integer multipliers and rounding biases used by the two passes. */
237 v4i32 cnst_17 = {17, 17, 17, 17};
238 v4i32 cnst_22 = {22, 22, 22, 22};
239 v4i32 cnst_10 = {10, 10, 10, 10};
240 v4i32 cnst_12 = {12, 12, 12, 12};
241 v4i32 cnst_64 = {64, 64, 64, 64};
242 v4i32 cnst_16 = {16, 16, 16, 16};
243 v4i32 cnst_15 = {15, 15, 15, 15};
244 v4i32 cnst_4 = {4, 4, 4, 4};
245 v4i32 cnst_6 = {6, 6, 6, 6};
246 v4i32 cnst_9 = {9, 9, 9, 9};
/* First pass, first group of terms: 12 * (a +/- b) with a bias of 4, plus a
 * 16/6 weighted pair.
 * NOTE(review): t5..t8, which are read further down, must be produced from
 * these four values by statements that are not shown here - confirm. */
256 t1 = cnst_12 * (in0 + in4) + cnst_4;
257 t2 = cnst_12 * (in0 - in4) + cnst_4;
258 t3 = cnst_16 * in2 + cnst_6 * in6;
259 t4 = cnst_6 * in2 - cnst_16 * in6;
/* First pass, second group of terms: t1..t4 are overwritten with weighted
 * sums of in1, in3, in5 and in7 (weights 16, 15, 9, 4). */
262 t1 = cnst_16 * in1 + cnst_15 * in3 + cnst_9 * in5 + cnst_4 * in7;
263 t2 = cnst_15 * in1 - cnst_4 * in3 - cnst_16 * in5 - cnst_9 * in7;
264 t3 = cnst_9 * in1 - cnst_16 * in3 + cnst_4 * in5 + cnst_15 * in7;
265 t4 = cnst_4 * in1 - cnst_9 * in3 + cnst_15 * in5 - cnst_16 * in7;
/* Combine both groups and scale down by 2^3: outputs 0..3 are the sums,
 * outputs 4..7 the mirrored differences. */
266 in0 = (
t5 +
t1) >> 3;
267 in1 = (
t6 +
t2) >> 3;
268 in2 = (
t7 +
t3) >> 3;
269 in3 = (
t8 +
t4) >> 3;
270 in4 = (
t8 -
t4) >> 3;
271 in5 = (
t7 -
t3) >> 3;
272 in6 = (
t6 -
t2) >> 3;
273 in7 = (
t5 -
t1) >> 3;
/* Pack output pairs (in4,in0) .. (in7,in3) into t1..t4.
 * NOTE(review): the statements that re-arrange t1..t4 back into in0..in7 for
 * the second pass are not shown in this excerpt. */
276 PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3,
t1,
t2,
t3,
t4);
/* Read four rows from dest (linesize apart) and interleave them with a zero
 * vector: bytes first, then the left halfwords into dst4..dst7 and the right
 * halfwords into dst0..dst3. Presumably this zero-extends eight pixels per
 * row to 32-bit lanes - confirm against the ILV*_SW macro definitions. */
279 LD_SW4(dest, linesize, dst0, dst1, dst2, dst3);
280 ILVR_B4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
281 dst0, dst1, dst2, dst3);
282 ILVL_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
283 dst4, dst5, dst6, dst7);
284 ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
285 dst0, dst1, dst2, dst3);
/* Second pass on in0..in3: a 4-input butterfly with weights 17, 22 and 10
 * and a bias of 64, scaled down by 2^7, then added to dst0..dst3. */
287 t1 = cnst_17 * (in0 + in2) + cnst_64;
288 t2 = cnst_17 * (in0 - in2) + cnst_64;
289 t3 = cnst_22 * in1 + cnst_10 * in3;
290 t4 = cnst_22 * in3 - cnst_10 * in1;
291 in0 = (
t1 +
t3) >> 7;
292 in1 = (
t2 -
t4) >> 7;
293 in2 = (
t2 +
t4) >> 7;
294 in3 = (
t1 -
t3) >> 7;
295 ADD4(in0, dst0, in1, dst1, in2, dst2, in3, dst3, in0, in1, in2, in3);
/* The same butterfly on in4..in7, using t5..t8, added to dst4..dst7. */
298 t5 = cnst_17 * (in4 + in6) + cnst_64;
299 t6 = cnst_17 * (in4 - in6) + cnst_64;
300 t7 = cnst_22 * in5 + cnst_10 * in7;
301 t8 = cnst_22 * in7 - cnst_10 * in5;
302 in4 = (
t5 +
t7) >> 7;
303 in5 = (
t6 -
t8) >> 7;
304 in6 = (
t6 +
t8) >> 7;
305 in7 = (
t5 -
t7) >> 7;
306 ADD4(in4, dst4, in5, dst5, in6, dst6, in7, dst7, in4, in5, in6, in7);
/* Pack the two halves of each row together and store to dest, linesize
 * apart.
 * NOTE(review): any clamp or further narrowing between this pack and the
 * store is not shown in this excerpt. */
308 PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3, in0, in1, in2, in3);
310 ST_D4(in0, in1, 0, 1, 0, 1, dest, linesize);
/* Fragment of a two-stage 4-tap filter routine: taps are chosen first by
 * vmode, then by hmode, and the result is written to dst. Presumably this is
 * the put_vc1_mspel_mc_h_v_msa() invoked by the wrapper macro later in the
 * file - the line carrying the function name is not visible here. The source
 * loads, the computation of `r`, the declarations of t0..t15 and some macro
 * tails are also missing from this excerpt. */
314 ptrdiff_t
stride,
int hmode,
int vmode,
317 v8i16 in_r0, in_r1, in_r2, in_r3, in_l0, in_l1, in_l2, in_l3;
320 v8i16 cnst_para0, cnst_para1, cnst_para2, cnst_para3, cnst_r;
/* Filter taps, one row per mode; only the first row is visible here. */
321 static const int para_value[][4] = {{4, 53, 18, 3},
/* Per-mode shift contribution; `shift` is the halved sum of the hmode and
 * vmode entries. Its use is not visible in this excerpt. */
324 static const int shift_value[] = {0, 5, 1, 5};
325 int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
/* Splat the rounding term r (computed in a line not shown) to all lanes. */
327 cnst_r = __msa_fill_h(
r);
/* Splat the four taps for the first stage. The index is vmode - 1, so
 * vmode == 0 would read before the table; presumably callers only pass
 * 1..3 - confirm. */
329 cnst_para0 = __msa_fill_h(para_value[vmode - 1][0]);
330 cnst_para1 = __msa_fill_h(para_value[vmode - 1][1]);
331 cnst_para2 = __msa_fill_h(para_value[vmode - 1][2]);
332 cnst_para3 = __msa_fill_h(para_value[vmode - 1][3]);
/* First stage: each output is para1*a + para2*b - para0*c - para3*d, computed
 * separately for the in_r* half (t0..t7) and the in_l* half (t8..t15). The
 * role each in_* register plays rotates by one position per output;
 * presumably one register is reloaded with a new source row between
 * statements (those loads are not shown) - confirm. */
339 t0 = cnst_para1 * in_r1 + cnst_para2 * in_r2
340 - cnst_para0 * in_r0 - cnst_para3 * in_r3;
341 t8 = cnst_para1 * in_l1 + cnst_para2 * in_l2
342 - cnst_para0 * in_l0 - cnst_para3 * in_l3;
346 t1 = cnst_para1 * in_r2 + cnst_para2 * in_r3
347 - cnst_para0 * in_r1 - cnst_para3 * in_r0;
348 t9 = cnst_para1 * in_l2 + cnst_para2 * in_l3
349 - cnst_para0 * in_l1 - cnst_para3 * in_l0;
353 t2 = cnst_para1 * in_r3 + cnst_para2 * in_r0
354 - cnst_para0 * in_r2 - cnst_para3 * in_r1;
355 t10 = cnst_para1 * in_l3 + cnst_para2 * in_l0
356 - cnst_para0 * in_l2 - cnst_para3 * in_l1;
360 t3 = cnst_para1 * in_r0 + cnst_para2 * in_r1
361 - cnst_para0 * in_r3 - cnst_para3 * in_r2;
362 t11 = cnst_para1 * in_l0 + cnst_para2 * in_l1
363 - cnst_para0 * in_l3 - cnst_para3 * in_l2;
367 t4 = cnst_para1 * in_r1 + cnst_para2 * in_r2
368 - cnst_para0 * in_r0 - cnst_para3 * in_r3;
369 t12 = cnst_para1 * in_l1 + cnst_para2 * in_l2
370 - cnst_para0 * in_l0 - cnst_para3 * in_l3;
374 t5 = cnst_para1 * in_r2 + cnst_para2 * in_r3
375 - cnst_para0 * in_r1 - cnst_para3 * in_r0;
376 t13 = cnst_para1 * in_l2 + cnst_para2 * in_l3
377 - cnst_para0 * in_l1 - cnst_para3 * in_l0;
381 t6 = cnst_para1 * in_r3 + cnst_para2 * in_r0
382 - cnst_para0 * in_r2 - cnst_para3 * in_r1;
383 t14 = cnst_para1 * in_l3 + cnst_para2 * in_l0
384 - cnst_para0 * in_l2 - cnst_para3 * in_l1;
388 t7 = cnst_para1 * in_r0 + cnst_para2 * in_r1
389 - cnst_para0 * in_r3 - cnst_para3 * in_r2;
390 t15 = cnst_para1 * in_l0 + cnst_para2 * in_l1
391 - cnst_para0 * in_l3 - cnst_para3 * in_l2;
/* Add the rounding term to the first-stage outputs. The ADD4 for t8..t11,
 * the tail of the t12..t15 call and any following shift are not shown. */
393 ADD4(
t0, cnst_r,
t1, cnst_r,
t2, cnst_r,
t3, cnst_r,
t0,
t1,
t2,
t3);
394 ADD4(
t4, cnst_r,
t5, cnst_r,
t6, cnst_r,
t7, cnst_r,
t4,
t5,
t6,
t7);
397 ADD4(
t12, cnst_r, t13, cnst_r, t14, cnst_r,
t15, cnst_r,
/* Second stage: reload the taps for hmode (again indexed with mode - 1) and
 * re-splat r, which is presumably recomputed in a line not shown. */
407 cnst_para0 = __msa_fill_h(para_value[hmode - 1][0]);
408 cnst_para1 = __msa_fill_h(para_value[hmode - 1][1]);
409 cnst_para2 = __msa_fill_h(para_value[hmode - 1][2]);
410 cnst_para3 = __msa_fill_h(para_value[hmode - 1][3]);
412 cnst_r = __msa_fill_h(
r);
/* t[i] = para1*t[i+1] + para2*t[i+2] - para0*t[i] - para3*t[i+3].
 * The update is done in place: each statement reads only t[i] and
 * higher-numbered values, none of which has been overwritten yet. */
414 t0 = cnst_para1 *
t1 + cnst_para2 *
t2 - cnst_para0 *
t0 - cnst_para3 *
t3;
415 t1 = cnst_para1 *
t2 + cnst_para2 *
t3 - cnst_para0 *
t1 - cnst_para3 *
t4;
416 t2 = cnst_para1 *
t3 + cnst_para2 *
t4 - cnst_para0 *
t2 - cnst_para3 *
t5;
417 t3 = cnst_para1 *
t4 + cnst_para2 *
t5 - cnst_para0 *
t3 - cnst_para3 *
t6;
418 t4 = cnst_para1 *
t5 + cnst_para2 *
t6 - cnst_para0 *
t4 - cnst_para3 *
t7;
419 t5 = cnst_para1 *
t6 + cnst_para2 *
t7 - cnst_para0 *
t5 - cnst_para3 *
t8;
420 t6 = cnst_para1 *
t7 + cnst_para2 *
t8 - cnst_para0 *
t6 - cnst_para3 *
t9;
421 t7 = cnst_para1 *
t8 + cnst_para2 *
t9 - cnst_para0 *
t7 - cnst_para3 *
t10;
/* Add the rounding term, then scale every output down by 2^7. */
422 ADD4(
t0, cnst_r,
t1, cnst_r,
t2, cnst_r,
t3, cnst_r,
t0,
t1,
t2,
t3);
423 ADD4(
t4, cnst_r,
t5, cnst_r,
t6, cnst_r,
t7, cnst_r,
t4,
t5,
t6,
t7);
424 t0 >>= 7,
t1 >>= 7,
t2 >>= 7,
t3 >>= 7;
425 t4 >>= 7,
t5 >>= 7,
t6 >>= 7,
t7 >>= 7;
/* Pack adjacent output pairs into t0..t3 (presumably narrowing to bytes) and
 * store them to dst, stride apart.
 * NOTE(review): any clamp between the shift and this pack is not shown in
 * this excerpt - confirm. */
429 PCKEV_B4_SH(
t1,
t0,
t3,
t2,
t5,
t4,
t7,
t6,
t0,
t1,
t2,
t3);
430 ST_D8(
t0,
t1,
t2,
t3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
stride);
433 #define PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \
434 void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst, \
435 const uint8_t *src, \
436 ptrdiff_t stride, int rnd) \
438 put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \
440 void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst, \
441 const uint8_t *src, \
442 ptrdiff_t stride, int rnd) \
444 put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \
445 put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd); \
446 dst += 8 * stride, src += 8 * stride; \
447 put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd); \
448 put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd); \