              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,
        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];

#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;

#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] = g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] = g[Y2];
        dest[i * 6 + 5] = b_r[Y2];
        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;
        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;
            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 = d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 = d64[(i * 2 + 1) & 7];

            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 = d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 = d64[(i * 2 + 1) & 7];
            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                      ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);

            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
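/* Pick one Y1/Y2 pair and its shared U/V out of the given vector lanes,
 * fetch the per-component table pointers and let yuv2rgb_write() pack the
 * two output pixels. */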
#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
    r  = c->table_rV[V];                                                \
    g  = (c->table_gU[U] + c->table_gV[V]);                             \
    b  = c->table_bU[U];                                                \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
                  r, g, b, y, target, 0);                               \
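/* Multi-tap (X) path: every output pixel pair is the weighted sum of
 * lumFilterSize luma and chrFilterSize chroma taps accumulated in 32-bit
 * lanes, scaled down by 19 bits and offset by 'headroom' so the table
 * lookups above stay inside the clipped range. */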
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc, int chrFilterSize,
                                    const int16_t **alpSrc, uint8_t *dest, int dstW,

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i headroom = __lsx_vreplgr2vr_w(head);
    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
                      48, l_src3, l_src4);
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yh_ev1 = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
            yh_od1 = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
            yh_ev2 = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
            yh_od2 = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
                      u_src2, v_src2);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
        u_od2  = __lsx_vsrai_w(u_od2, 19);
        v_od2  = __lsx_vsrai_w(v_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
        u_od2  = __lsx_vadd_w(u_od2, headroom);
        v_od2  = __lsx_vadd_w(v_od2, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, u_src1, v_src1;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
    for (; count < len_count; count++) {

        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2] * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];
                      r, g, b, y, target, 0);
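/* Bilinear (2-row) path: each output line blends two input rows, weighted by
 * yalpha/uvalpha against their 4096 complements, before the same table-driven
 * write as above. */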
                                    const int16_t *ubuf[2], const int16_t *vbuf[2],
                                    const int16_t *abuf[2], uint8_t *dest, int dstW,
                                    int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i headroom   = __lsx_vreplgr2vr_w(head);
    __m128i zero       = __lsx_vldi(0);
    for (i = 0; i < len; i += 8) {
        int c_dex = count << 1;
        __m128i y0_h, y0_l, y0, u0, v0;
        __m128i y1_h, y1_l, y1, u1, v1;
        __m128i y_l, y_h, u, v;
        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        y_h  = __lsx_vsrai_w(y_h, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
        __m128i y0_l, y0, u0, v0;
        __m128i y1_l, y1, u1, v1;

        y0 = __lsx_vldx(buf0, i_dex);
        u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
        v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
        y1 = __lsx_vldx(buf1, i_dex);
        u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
        v1 = __lsx_vldrepl_d((vbuf1 + count), 0);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2]     * yalpha1 +
                  buf1[count * 2]     * yalpha) >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1 +
                  buf1[count * 2 + 1] * yalpha) >> 19;
        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;

                      r, g, b, y, target, 0);
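/* Single-row path: the input is already vertically filtered, so only a
 * rounding shift by 7 (and, when two chroma rows are given, a blend of them)
 * is needed before conversion. */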
                                    const int16_t *ubuf[2], const int16_t *vbuf[2],
                                    const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int len = (dstW - 7);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i headroom = __lsx_vreplgr2vr_h(head);
    for (i = 0; i < len; i += 8) {
        int c_dex = count << 1;
        __m128i src_y, src_u, src_v;
        __m128i u, v, uv, y_l, y_h;

        src_y = __lsx_vldx(buf0, i_dex);
        DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
        src_y = __lsx_vsrari_h(src_y, 7);
        src_u = __lsx_vsrari_h(src_u, 7);
        src_v = __lsx_vsrari_h(src_v, 7);
        y_l   = __lsx_vsllwil_w_h(src_y, 0);
        y_h   = __lsx_vexth_w_h(src_y);
        uv    = __lsx_vilvl_h(src_v, src_u);

        v     = __lsx_vaddwod_w_h(uv, headroom);
        __m128i src_y, src_u, src_v;
        __m128i y_l, u, v, uv;

        src_y = __lsx_vldx(buf0, i_dex);
        src_u = __lsx_vldrepl_d((ubuf0 + count), 0);
        src_v = __lsx_vldrepl_d((vbuf0 + count), 0);
        y_l   = __lsx_vsrari_h(src_y, 7);
        y_l   = __lsx_vsllwil_w_h(y_l, 0);
        uv    = __lsx_vilvl_h(src_v, src_u);
        uv    = __lsx_vsrari_h(uv, 7);

        v     = __lsx_vaddwod_w_h(uv, headroom);
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2    ] + 64) >> 7;
        int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
        int U  = (ubuf0[count] + 64) >> 7;
        int V  = (vbuf0[count] + 64) >> 7;

                      r, g, b, y, target, 0);
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];

        int uvalpha1 = 4096 - uvalpha;
        __m128i headroom     = __lsx_vreplgr2vr_w(HEADROOM);
        __m128i uvalpha_tmp1 = __lsx_vreplgr2vr_h(uvalpha1);
        __m128i uvalpha_tmp  = __lsx_vreplgr2vr_h(uvalpha);
        for (i = 0; i < len; i += 8) {
            int c_dex = count << 1;
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
            __m128i y_l, y_h, u1, u2, v1, v2, u_ev, v_od;

            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
            src_v1 = __lsx_vldx(vbuf1, c_dex);
            src_y  = __lsx_vsrari_h(src_y, 7);

            u_ev = __lsx_vmulwev_w_h(src_u0, uvalpha_tmp1);
            v_od = __lsx_vmulwod_w_h(src_u0, uvalpha_tmp1);
            u1   = __lsx_vmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
            v1   = __lsx_vmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
            u_ev = __lsx_vmulwev_w_h(src_v0, uvalpha_tmp1);
            v_od = __lsx_vmulwod_w_h(src_v0, uvalpha_tmp1);
            u2   = __lsx_vmaddwev_w_h(u_ev, src_v1, uvalpha_tmp);
            v2   = __lsx_vmaddwod_w_h(v_od, src_v1, uvalpha_tmp);

            y_l = __lsx_vsllwil_w_h(src_y, 0);
            y_h = __lsx_vexth_w_h(src_y);
            u1  = __lsx_vsrari_w(u1, 19);
            v1  = __lsx_vsrari_w(v1, 19);
            u2  = __lsx_vsrari_w(u2, 19);
            v2  = __lsx_vsrari_w(v2, 19);
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;

                          r, g, b, y, target, 0);
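/* Instantiate the _X/_2/_1 entry points for every packed RGB layout from the
 * three templates above. */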
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
static void name ## ext ## _X_lsx(SwsInternal *c, const int16_t *lumFilter,          \
                                  const int16_t **lumSrc, int lumFilterSize,         \
                                  const int16_t *chrFilter, const int16_t **chrUSrc, \
                                  const int16_t **chrVSrc, int chrFilterSize,        \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW,   \
    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,             \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,      \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha);           \

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
static void name ## ext ## _2_lsx(SwsInternal *c, const int16_t *buf[2],             \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW,   \
                                  int yalpha, int uvalpha, int y)                    \
    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                  \
                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);        \

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
static void name ## ext ## _1_lsx(SwsInternal *c, const int16_t *buf0,               \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],    \
                                  const int16_t *abuf0, uint8_t *dest, int dstW,     \
                                  int uvalpha, int y)                                 \
    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                \
                                    dstW, uvalpha, y, fmt, hasAlpha);                \
#if CONFIG_SWSCALE_ALPHA
                                  uint8_t *dest, int i, int R, int A, int G, int B,

    if ((R | G | B) & 0xC0000000) {
            dest[0] = hasAlpha ? A : 255;

            dest[3] = hasAlpha ? A : 255;

            dest[0] = hasAlpha ? A : 255;

            dest[3] = hasAlpha ? A : 255;
    switch (c->opts.dither) {
        R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
        G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
        B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
        c->dither_error[0][i] = err[0];
        c->dither_error[1][i] = err[1];
        c->dither_error[2][i] = err[2];
        r = R >> (isrgb8 ? 5 : 7);
        g = G >> (isrgb8 ? 5 : 6);
        b = B >> (isrgb8 ? 6 : 7);

        err[0] = R - r*(isrgb8 ? 36 : 255);
        err[1] = G - g*(isrgb8 ? 36 : 85);
        err[2] = B - b*(isrgb8 ? 85 : 255);
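    /* The ordered-dither modes below derive the threshold purely from the
     * pixel coordinates, so no error buffer has to be carried across lines. */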
#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))

#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
            dest[0] = r + 2*g + 8*b;

            dest[0] = b + 2*g + 8*r;

            dest[0] = r + 8*g + 64*b;

            dest[0] = b + 4*g + 32*r;
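/* Load the per-context YUV->RGB coefficients once and splat them into vectors
 * for YUVTORGB_LSX. */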
#define YUVTORGB_SETUP_LSX                          \
    int y_offset = c->yuv2rgb_y_offset;             \
    int y_coeff  = c->yuv2rgb_y_coeff;              \
    int v2r_coe  = c->yuv2rgb_v2r_coeff;            \
    int v2g_coe  = c->yuv2rgb_v2g_coeff;            \
    int u2g_coe  = c->yuv2rgb_u2g_coeff;            \
    int u2b_coe  = c->yuv2rgb_u2b_coeff;            \
    __m128i offset = __lsx_vreplgr2vr_w(y_offset);  \
    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);   \
    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);   \
    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);   \
    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);   \
    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);   \
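/* Offset and scale the luma, then accumulate the chroma matrix terms into R,
 * G and B (still in the intermediate fixed-point domain). */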
#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff, \
                     y_temp, v2r, v2g, u2g, u2b)      \
    y = __lsx_vsub_w(y, offset);                      \
    y = __lsx_vmul_w(y, coeff);                       \
    y = __lsx_vadd_w(y, y_temp);                      \
    R = __lsx_vmadd_w(y, v, v2r);                     \
    v = __lsx_vmadd_w(y, v, v2g);                     \
    G = __lsx_vmadd_w(v, u, u2g);                     \
    B = __lsx_vmadd_w(y, u, u2b);                     \
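/* Extract one lane of the R/G/B (and, for the _A_ variant, A) vectors and
 * hand the scalars to yuv2rgb_write_full(). */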
#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                  \
    R = __lsx_vpickve2gr_w(r, t1);                                           \
    G = __lsx_vpickve2gr_w(g, t1);                                           \
    B = __lsx_vpickve2gr_w(b, t1);                                           \
    A = __lsx_vpickve2gr_w(a, t1);                                           \
    A = av_clip_uint8(A);                                                    \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\

#define WRITE_FULL_LSX(r, g, b, t1, s)                                       \
    R = __lsx_vpickve2gr_w(r, t1);                                           \
    G = __lsx_vpickve2gr_w(g, t1);                                           \
    B = __lsx_vpickve2gr_w(b, t1);                                           \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
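/* Full-chroma X path: accumulate the filter taps, scale the sums down by 10
 * bits, convert with YUVTORGB_LSX and store pixel by pixel through the
 * WRITE_FULL macros, which keeps per-pixel dithering possible. */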
                                         const int16_t **lumSrc, int lumFilterSize,
                                         const int16_t *chrFilter, const int16_t **chrUSrc,
                                         const int16_t **chrVSrc, int chrFilterSize,
                                         const int16_t **alpSrc, uint8_t *dest,

    int i, j, B, G, R, A;

    int a_temp = 1 << 18;

    int tempc = templ - (128 << 19);

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
    for (i = 0; i < len; i += 8) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
                      u_src, v_src);
                      v_src, temp, u_ev, v_ev);
                      v_src, temp, u_od, v_od);
        y_ev = __lsx_vsrai_w(y_ev, 10);
        y_od = __lsx_vsrai_w(y_od, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        u_od = __lsx_vsrai_w(u_od, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        v_od = __lsx_vsrai_w(v_od, 10);
                     y_temp, v2r, v2g, u2g, u2b);
                     y_temp, v2r, v2g, u2g, u2b);
            __m128i a_src, a_ev, a_od;

            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);
            a_ev = __lsx_vsrai_w(a_ev, 19);
            a_od = __lsx_vsrai_w(a_od, 19);
    if (dstW - i >= 4) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, u_ev, v_ev, uv, temp;
        __m128i R_ev, G_ev, B_ev;

        y_ev = __lsx_vreplgr2vr_w(templ);
        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            l_src = __lsx_vilvl_h(l_src, l_src);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
        for (j = 0; j < chrFilterSize; j++) {
            temp = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            uv   = __lsx_vilvl_h(v_src, u_src);
            u_ev = __lsx_vmaddwev_w_h(u_ev, uv, temp);
            v_ev = __lsx_vmaddwod_w_h(v_ev, uv, temp);
        y_ev = __lsx_vsrai_w(y_ev, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a_src, a_ev;

            a_ev = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
            a_ev = __lsx_vsrai_w(a_ev, 19);
    for (; i < dstW; i++) {
        int V, U = V = tempc;

        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];
        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];

            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];

        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
                                         const int16_t *ubuf[2], const int16_t *vbuf[2],
                                         const int16_t *abuf[2], uint8_t *dest, int dstW,
                                         int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int uvtemp   = 128 << 19;
    int atemp    = 1 << 18;

    int ytemp    = 1 << 21;

    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i uv         = __lsx_vreplgr2vr_w(uvtemp);
    __m128i a_bias     = __lsx_vreplgr2vr_w(atemp);
    __m128i y_temp     = __lsx_vreplgr2vr_w(ytemp);
    for (i = 0; i < len; i += 8) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
        __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
        __m128i y_l, y_h, v_l, v_h, u_l, u_h;
        __m128i R_l, R_h, G_l, G_h, B_l, B_h;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
                  n, ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);

        DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
                  u0_h, u1_h, v0_h, v1_h);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        u_h  = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        v_h  = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
        u_l  = __lsx_vsub_w(u_l, uv);
        u_h  = __lsx_vsub_w(u_h, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        v_h  = __lsx_vsub_w(v_h, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        y_h  = __lsx_vsrai_w(y_h, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        u_h  = __lsx_vsrai_w(u_h, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
        v_h  = __lsx_vsrai_w(v_h, 10);
                     y_temp, v2r, v2g, u2g, u2b);
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a0, a1, a0_l, a0_h;
            __m128i a_l, a_h, a1_l, a1_h;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
            a_l = __lsx_vsrai_w(a_l, 19);
            a_h = __lsx_vsrai_w(a_h, 19);
    if (dstW - i >= 4) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y1_l, u0_l;
        __m128i v0_l, u1_l, v1_l;
        __m128i y_l, u_l, v_l;
        __m128i R_l, G_l, B_l;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
                  ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        u_l  = __lsx_vsub_w(u_l, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
                     y_temp, v2r, v2g, u2g, u2b);

            __m128i a0, a1, a0_l;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_l = __lsx_vsrai_w(a_l, 19);
    for (; i < dstW; i++) {
        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;

            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;

        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
                                         const int16_t *ubuf[2], const int16_t *vbuf[2],
                                         const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int ytemp = 1 << 21;

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
    if (uvalpha < 2048) {
        int uvtemp   = 128 << 7;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i bias = __lsx_vreplgr2vr_w(bias_int);

        for (i = 0; i < len; i += 8) {
            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
            __m128i R_l, R_h, G_l, G_h, B_l, B_h;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 2);
            y_h = __lsx_vexth_w_h(b);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);

            y_h = __lsx_vslli_w(y_h, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            u_h = __lsx_vsub_w(ub_h, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            v_h = __lsx_vsub_w(vb_h, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            u_h = __lsx_vslli_w(u_h, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            v_h = __lsx_vslli_w(v_h, 2);
                         y_temp, v2r, v2g, u2g, u2b);
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_l   = __lsx_vsllwil_w_h(a_src, 0);
                a_h   = __lsx_vexth_w_h(a_src);
                a_l   = __lsx_vadd_w(a_l, bias);
                a_h   = __lsx_vadd_w(a_h, bias);
                a_l   = __lsx_vsrai_w(a_l, 7);
                a_h   = __lsx_vsrai_w(a_h, 7);
        if (dstW - i >= 4) {
            __m128i b, ub, vb, ub_l, vb_l;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_l = __lsx_vslli_w(y_l, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            v_l = __lsx_vslli_w(v_l, 2);
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vldx(abuf0, n);
                a_src = __lsx_vsllwil_w_h(a_src, 0);
                a_l   = __lsx_vadd_w(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;

                A = (abuf0[i] + 64) >> 7;

            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp   = 128 << 8;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i zero = __lsx_vldi(0);
        __m128i bias = __lsx_vreplgr2vr_h(bias_int);

        for (i = 0; i < len; i += 8) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1  = __lsx_vldx(vbuf1, n);
            y_ev = __lsx_vaddwev_w_h(b, zero);
            y_od = __lsx_vaddwod_w_h(b, zero);
            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);
                         y_temp, v2r, v2g, u2g, u2b);
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
                a_od  = __lsx_vaddwod_w_h(bias, a_src);
                a_ev  = __lsx_vsrai_w(a_ev, 7);
                a_od  = __lsx_vsrai_w(a_od, 7);
        if (dstW - i >= 4) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lsx_vldx(vbuf1, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            y_l = __lsx_vslli_w(y_l, 2);
            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lsx_vsub_w(u_l, uv);
            v_l = __lsx_vsub_w(v_l, uv);
            u_l = __lsx_vslli_w(u_l, 1);
            v_l = __lsx_vslli_w(v_l, 1);
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_l   = __lsx_vaddwev_w_h(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;

                A = (abuf0[i] + 64) >> 7;

            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)
               CONFIG_SWSCALE_ALPHA && c->needAlpha)

#if CONFIG_SWSCALE_ALPHA
    } else if (is16BPS(dstFormat)) {
    } else if (isNBPS(dstFormat)) {

        switch (c->opts.dst_format) {
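        /* Full chroma interpolation writers: one chroma sample per output
         * pixel, routed through yuv2rgb_write_full(). */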
            c->yuv2packedX = yuv2rgba32_full_X_lsx;
            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
            c->yuv2packed1 = yuv2rgba32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2rgba32_full_X_lsx;
                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
                c->yuv2packed1 = yuv2rgba32_full_1_lsx;

                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;

            c->yuv2packedX = yuv2argb32_full_X_lsx;
            c->yuv2packed2 = yuv2argb32_full_2_lsx;
            c->yuv2packed1 = yuv2argb32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2argb32_full_X_lsx;
                c->yuv2packed2 = yuv2argb32_full_2_lsx;
                c->yuv2packed1 = yuv2argb32_full_1_lsx;

                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;

            c->yuv2packedX = yuv2bgra32_full_X_lsx;
            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
            c->yuv2packed1 = yuv2bgra32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2bgra32_full_X_lsx;
                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
                c->yuv2packed1 = yuv2bgra32_full_1_lsx;

                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;

            c->yuv2packedX = yuv2abgr32_full_X_lsx;
            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
            c->yuv2packed1 = yuv2abgr32_full_1_lsx;
#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2abgr32_full_X_lsx;
                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
                c->yuv2packed1 = yuv2abgr32_full_1_lsx;

                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;

            c->yuv2packedX = yuv2rgb24_full_X_lsx;
            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
            c->yuv2packed1 = yuv2rgb24_full_1_lsx;

            c->yuv2packedX = yuv2bgr24_full_X_lsx;
            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
            c->yuv2packed1 = yuv2bgr24_full_1_lsx;

            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;

            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;

            c->yuv2packedX = yuv2bgr8_full_X_lsx;
            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
            c->yuv2packed1 = yuv2bgr8_full_1_lsx;

            c->yuv2packedX = yuv2rgb8_full_X_lsx;
            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
        switch (c->opts.dst_format) {
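        /* Table-driven writers: chroma shared between horizontal pixel pairs,
         * written through yuv2rgb_write(). */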
#if CONFIG_SWSCALE_ALPHA

            c->yuv2packed1 = yuv2rgbx32_1_lsx;
            c->yuv2packed2 = yuv2rgbx32_2_lsx;
            c->yuv2packedX = yuv2rgbx32_X_lsx;

#if CONFIG_SWSCALE_ALPHA

            c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
            c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
            c->yuv2packedX = yuv2rgbx32_1_X_lsx;

            c->yuv2packed1 = yuv2rgb24_1_lsx;
            c->yuv2packed2 = yuv2rgb24_2_lsx;
            c->yuv2packedX = yuv2rgb24_X_lsx;

            c->yuv2packed1 = yuv2bgr24_1_lsx;
            c->yuv2packed2 = yuv2bgr24_2_lsx;
            c->yuv2packedX = yuv2bgr24_X_lsx;

            c->yuv2packed1 = yuv2rgb16_1_lsx;
            c->yuv2packed2 = yuv2rgb16_2_lsx;
            c->yuv2packedX = yuv2rgb16_X_lsx;

            c->yuv2packed1 = yuv2rgb15_1_lsx;
            c->yuv2packed2 = yuv2rgb15_2_lsx;
            c->yuv2packedX = yuv2rgb15_X_lsx;

            c->yuv2packed1 = yuv2rgb12_1_lsx;
            c->yuv2packed2 = yuv2rgb12_2_lsx;
            c->yuv2packedX = yuv2rgb12_X_lsx;

            c->yuv2packed1 = yuv2rgb8_1_lsx;
            c->yuv2packed2 = yuv2rgb8_2_lsx;
            c->yuv2packedX = yuv2rgb8_X_lsx;

            c->yuv2packed1 = yuv2rgb4_1_lsx;
            c->yuv2packed2 = yuv2rgb4_2_lsx;
            c->yuv2packedX = yuv2rgb4_X_lsx;

            c->yuv2packed1 = yuv2rgb4b_1_lsx;
            c->yuv2packed2 = yuv2rgb4b_2_lsx;
            c->yuv2packedX = yuv2rgb4b_X_lsx;