FFmpeg
input_lasx.c
Go to the documentation of this file.
/*
 * Copyright (C) 2022 Loongson Technology Corporation Limited
 * Contributed by Hao Chen(chenhao@loongson.cn)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 
22 #include "swscale_loongarch.h"
24 
25 void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
26  int width, int32_t *rgb2yuv, void *opq)
27 {
28  int i;
29  uint16_t *dstU = (uint16_t *)_dstU;
30  uint16_t *dstV = (uint16_t *)_dstV;
31  int set = 0x4001 << (RGB2YUV_SHIFT - 7);
32  int len = width - 15;
33  int32_t tem_ru = rgb2yuv[RU_IDX], tem_gu = rgb2yuv[GU_IDX];
34  int32_t tem_bu = rgb2yuv[BU_IDX], tem_rv = rgb2yuv[RV_IDX];
35  int32_t tem_gv = rgb2yuv[GV_IDX], tem_bv = rgb2yuv[BV_IDX];
36  int shift = RGB2YUV_SHIFT - 6;
37  const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
38  __m256i ru, gu, bu, rv, gv, bv;
39  __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
40  0x0D0C090805040100, 0x1D1C191815141110};
41  __m256i temp = __lasx_xvreplgr2vr_w(set);
42  __m256i sra = __lasx_xvreplgr2vr_w(shift);
43 
44  ru = __lasx_xvreplgr2vr_w(tem_ru);
45  gu = __lasx_xvreplgr2vr_w(tem_gu);
46  bu = __lasx_xvreplgr2vr_w(tem_bu);
47  rv = __lasx_xvreplgr2vr_w(tem_rv);
48  gv = __lasx_xvreplgr2vr_w(tem_gv);
49  bv = __lasx_xvreplgr2vr_w(tem_bv);
50  for (i = 0; i < len; i += 16) {
51  __m256i _g, _b, _r;
52  __m256i g_l, g_h, b_l, b_h, r_l, r_h;
53  __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
54 
55  _g = __lasx_xvldx(src0, i);
56  _b = __lasx_xvldx(src1, i);
57  _r = __lasx_xvldx(src2, i);
58  g_l = __lasx_vext2xv_wu_bu(_g);
59  b_l = __lasx_vext2xv_wu_bu(_b);
60  r_l = __lasx_vext2xv_wu_bu(_r);
61  _g = __lasx_xvpermi_d(_g, 0x01);
62  _b = __lasx_xvpermi_d(_b, 0x01);
63  _r = __lasx_xvpermi_d(_r, 0x01);
64  g_h = __lasx_vext2xv_wu_bu(_g);
65  b_h = __lasx_vext2xv_wu_bu(_b);
66  r_h = __lasx_vext2xv_wu_bu(_r);
67  u_l = __lasx_xvmadd_w(temp, ru, r_l);
68  u_h = __lasx_xvmadd_w(temp, ru, r_h);
69  v_l = __lasx_xvmadd_w(temp, rv, r_l);
70  v_h = __lasx_xvmadd_w(temp, rv, r_h);
71  u_l = __lasx_xvmadd_w(u_l, gu, g_l);
72  u_l = __lasx_xvmadd_w(u_l, bu, b_l);
73  u_h = __lasx_xvmadd_w(u_h, gu, g_h);
74  u_h = __lasx_xvmadd_w(u_h, bu, b_h);
75  v_l = __lasx_xvmadd_w(v_l, gv, g_l);
76  v_l = __lasx_xvmadd_w(v_l, bv, b_l);
77  v_h = __lasx_xvmadd_w(v_h, gv, g_h);
78  v_h = __lasx_xvmadd_w(v_h, bv, b_h);
79  u_l = __lasx_xvsra_w(u_l, sra);
80  u_h = __lasx_xvsra_w(u_h, sra);
81  v_l = __lasx_xvsra_w(v_l, sra);
82  v_h = __lasx_xvsra_w(v_h, sra);
83  u_lh = __lasx_xvshuf_b(u_h, u_l, mask);
84  v_lh = __lasx_xvshuf_b(v_h, v_l, mask);
85  u_lh = __lasx_xvpermi_d(u_lh, 0xD8);
86  v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
87  __lasx_xvst(u_lh, (dstU + i), 0);
88  __lasx_xvst(v_lh, (dstV + i), 0);
89  }
90  if (width - i >= 8) {
91  __m256i _g, _b, _r;
92  __m256i g_l, b_l, r_l;
93  __m256i v_l, u_l, u, v;
94 
95  _g = __lasx_xvldrepl_d((src0 + i), 0);
96  _b = __lasx_xvldrepl_d((src1 + i), 0);
97  _r = __lasx_xvldrepl_d((src2 + i), 0);
98  g_l = __lasx_vext2xv_wu_bu(_g);
99  b_l = __lasx_vext2xv_wu_bu(_b);
100  r_l = __lasx_vext2xv_wu_bu(_r);
101  u_l = __lasx_xvmadd_w(temp, ru, r_l);
102  v_l = __lasx_xvmadd_w(temp, rv, r_l);
103  u_l = __lasx_xvmadd_w(u_l, gu, g_l);
104  u_l = __lasx_xvmadd_w(u_l, bu, b_l);
105  v_l = __lasx_xvmadd_w(v_l, gv, g_l);
106  v_l = __lasx_xvmadd_w(v_l, bv, b_l);
107  u_l = __lasx_xvsra_w(u_l, sra);
108  v_l = __lasx_xvsra_w(v_l, sra);
109  u = __lasx_xvshuf_b(u_l, u_l, mask);
110  v = __lasx_xvshuf_b(v_l, v_l, mask);
111  __lasx_xvstelm_d(u, (dstU + i), 0, 0);
112  __lasx_xvstelm_d(u, (dstU + i), 8, 2);
113  __lasx_xvstelm_d(v, (dstV + i), 0, 0);
114  __lasx_xvstelm_d(v, (dstV + i), 8, 2);
115  i += 8;
116  }
117  for (; i < width; i++) {
118  int g = src[0][i];
119  int b = src[1][i];
120  int r = src[2][i];
121 
122  dstU[i] = (tem_ru * r + tem_gu * g + tem_bu * b + set) >> shift;
123  dstV[i] = (tem_rv * r + tem_gv * g + tem_bv * b + set) >> shift;
124  }
125 }
126 
127 void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
128  int32_t *rgb2yuv, void *opq)
129 {
130  int i;
131  int shift = (RGB2YUV_SHIFT - 6);
132  int set = 0x801 << (RGB2YUV_SHIFT - 7);
133  int len = width - 15;
134  uint16_t *dst = (uint16_t *)_dst;
135  int32_t tem_ry = rgb2yuv[RY_IDX], tem_gy = rgb2yuv[GY_IDX];
136  int32_t tem_by = rgb2yuv[BY_IDX];
137  const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
138  __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
139  0x0D0C090805040100, 0x1D1C191815141110};
140  __m256i temp = __lasx_xvreplgr2vr_w(set);
141  __m256i sra = __lasx_xvreplgr2vr_w(shift);
142  __m256i ry = __lasx_xvreplgr2vr_w(tem_ry);
143  __m256i gy = __lasx_xvreplgr2vr_w(tem_gy);
144  __m256i by = __lasx_xvreplgr2vr_w(tem_by);
145 
146  for (i = 0; i < len; i += 16) {
147  __m256i _g, _b, _r;
148  __m256i g_l, g_h, b_l, b_h, r_l, r_h;
149  __m256i y_l, y_h, y_lh;
150 
151  _g = __lasx_xvldx(src0, i);
152  _b = __lasx_xvldx(src1, i);
153  _r = __lasx_xvldx(src2, i);
154  g_l = __lasx_vext2xv_wu_bu(_g);
155  b_l = __lasx_vext2xv_wu_bu(_b);
156  r_l = __lasx_vext2xv_wu_bu(_r);
157  _g = __lasx_xvpermi_d(_g, 0x01);
158  _b = __lasx_xvpermi_d(_b, 0x01);
159  _r = __lasx_xvpermi_d(_r, 0x01);
160  g_h = __lasx_vext2xv_wu_bu(_g);
161  b_h = __lasx_vext2xv_wu_bu(_b);
162  r_h = __lasx_vext2xv_wu_bu(_r);
163  y_l = __lasx_xvmadd_w(temp, ry, r_l);
164  y_h = __lasx_xvmadd_w(temp, ry, r_h);
165  y_l = __lasx_xvmadd_w(y_l, gy, g_l);
166  y_l = __lasx_xvmadd_w(y_l, by, b_l);
167  y_h = __lasx_xvmadd_w(y_h, gy, g_h);
168  y_h = __lasx_xvmadd_w(y_h, by, b_h);
169  y_l = __lasx_xvsra_w(y_l, sra);
170  y_h = __lasx_xvsra_w(y_h, sra);
171  y_lh = __lasx_xvshuf_b(y_h, y_l, mask);
172  y_lh = __lasx_xvpermi_d(y_lh, 0xD8);
173  __lasx_xvst(y_lh, (dst + i), 0);
174  }
175  if (width - i >= 8) {
176  __m256i _g, _b, _r;
177  __m256i g_l, b_l, r_l;
178  __m256i y_l, y;
179 
180  _g = __lasx_xvldrepl_d((src0 + i), 0);
181  _b = __lasx_xvldrepl_d((src1 + i), 0);
182  _r = __lasx_xvldrepl_d((src2 + i), 0);
183  g_l = __lasx_vext2xv_wu_bu(_g);
184  b_l = __lasx_vext2xv_wu_bu(_b);
185  r_l = __lasx_vext2xv_wu_bu(_r);
186  y_l = __lasx_xvmadd_w(temp, ry, r_l);
187  y_l = __lasx_xvmadd_w(y_l, gy, g_l);
188  y_l = __lasx_xvmadd_w(y_l, by, b_l);
189  y_l = __lasx_xvsra_w(y_l, sra);
190  y = __lasx_xvshuf_b(y_l, y_l, mask);
191  __lasx_xvstelm_d(y, (dst + i), 0, 0);
192  __lasx_xvstelm_d(y, (dst + i), 8, 2);
193  i += 8;
194  }
195  for (; i < width; i++) {
196  int g = src[0][i];
197  int b = src[1][i];
198  int r = src[2][i];
199 
200  dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
201  }
202 }
203 
205 {
206  enum AVPixelFormat srcFormat = c->srcFormat;
207 
208  switch (srcFormat) {
209  case AV_PIX_FMT_YUYV422:
210  c->chrToYV12 = yuy2ToUV_lasx;
211  break;
212  case AV_PIX_FMT_YVYU422:
213  c->chrToYV12 = yvy2ToUV_lasx;
214  break;
215  case AV_PIX_FMT_UYVY422:
216  c->chrToYV12 = uyvyToUV_lasx;
217  break;
218  case AV_PIX_FMT_NV12:
219  case AV_PIX_FMT_NV16:
220  case AV_PIX_FMT_NV24:
221  c->chrToYV12 = nv12ToUV_lasx;
222  break;
223  case AV_PIX_FMT_NV21:
224  case AV_PIX_FMT_NV42:
225  c->chrToYV12 = nv21ToUV_lasx;
226  break;
227  case AV_PIX_FMT_GBRAP:
228  case AV_PIX_FMT_GBRP:
229  c->readChrPlanar = planar_rgb_to_uv_lasx;
230  break;
231  }
232 
233  if (c->needAlpha) {
234  switch (srcFormat) {
235  case AV_PIX_FMT_BGRA:
236  case AV_PIX_FMT_RGBA:
237  c->alpToYV12 = rgbaToA_lasx;
238  break;
239  case AV_PIX_FMT_ABGR:
240  case AV_PIX_FMT_ARGB:
241  c->alpToYV12 = abgrToA_lasx;
242  break;
243  }
244  }
245 }
_dst
uint8_t * _dst
Definition: dsp.h:52
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
r
const char * r
Definition: vf_curves.c:127
u
#define u(width, name, range_min, range_max)
Definition: cbs_h2645.c:251
src1
const pixel * src1
Definition: h264pred_template.c:421
mask
int mask
Definition: mediacodecdec_common.c:154
RV_IDX
#define RV_IDX
Definition: swscale_internal.h:476
RU_IDX
#define RU_IDX
Definition: swscale_internal.h:473
b
#define b
Definition: input.c:41
GV_IDX
#define GV_IDX
Definition: swscale_internal.h:477
rgb2yuv
static const char rgb2yuv[]
Definition: vf_scale_vulkan.c:69
BV_IDX
#define BV_IDX
Definition: swscale_internal.h:478
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
AV_PIX_FMT_GBRAP
@ AV_PIX_FMT_GBRAP
planar GBRA 4:4:4:4 32bpp
Definition: pixfmt.h:212
swscale_loongarch.h
av_cold
#define av_cold
Definition: attributes.h:90
set
static void set(uint8_t *a[], int ch, int index, int ch_count, enum AVSampleFormat f, double v)
Definition: swresample.c:59
g
const char * g
Definition: vf_curves.c:128
ff_sws_init_input_lasx
av_cold void ff_sws_init_input_lasx(SwsInternal *c)
Definition: input_lasx.c:204
GY_IDX
#define GY_IDX
Definition: swscale_internal.h:471
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
AV_PIX_FMT_YUYV422
@ AV_PIX_FMT_YUYV422
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:74
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
RY_IDX
#define RY_IDX
Definition: swscale_internal.h:470
shift
static int shift(int a, int b)
Definition: bonk.c:261
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
AV_PIX_FMT_NV16
@ AV_PIX_FMT_NV16
interleaved chroma YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
Definition: pixfmt.h:198
RGB2YUV_SHIFT
#define RGB2YUV_SHIFT
BY_IDX
#define BY_IDX
Definition: swscale_internal.h:472
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
src2
const pixel * src2
Definition: h264pred_template.c:422
AV_PIX_FMT_NV24
@ AV_PIX_FMT_NV24
planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:371
planar_rgb_to_uv_lasx
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
Definition: input_lasx.c:25
AV_PIX_FMT_NV21
@ AV_PIX_FMT_NV21
as above, but U and V bytes are swapped
Definition: pixfmt.h:97
len
int len
Definition: vorbis_enc_data.h:426
AV_PIX_FMT_NV42
@ AV_PIX_FMT_NV42
as above, but U and V bytes are swapped
Definition: pixfmt.h:372
AV_PIX_FMT_YVYU422
@ AV_PIX_FMT_YVYU422
packed YUV 4:2:2, 16bpp, Y0 Cr Y1 Cb
Definition: pixfmt.h:207
SwsInternal
Definition: swscale_internal.h:330
AV_PIX_FMT_NV12
@ AV_PIX_FMT_NV12
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:96
AV_PIX_FMT_UYVY422
@ AV_PIX_FMT_UYVY422
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:88
temp
else temp
Definition: vf_mcdeint.c:263
src0
const pixel *const src0
Definition: h264pred_template.c:420
AV_PIX_FMT_GBRP
@ AV_PIX_FMT_GBRP
planar GBR 4:4:4 24bpp
Definition: pixfmt.h:165
loongson_intrinsics.h
BU_IDX
#define BU_IDX
Definition: swscale_internal.h:475
int32_t
int32_t
Definition: audioconvert.c:56
planar_rgb_to_y_lasx
void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv, void *opq)
Definition: input_lasx.c:127
GU_IDX
#define GU_IDX
Definition: swscale_internal.h:474
width
#define width
Definition: dsp.h:85
src
#define src
Definition: vp8dsp.c:248