FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include <libavutil/avassert.h>
24 #include <libavutil/mem.h>
25 
26 #include "../ops_chain.h"
27 
/*
 * Declares a static const SwsOpEntry named op_<NAME> whose .type is
 * SWS_PIXEL_<TYPE>. Any further designated initializers are passed through
 * unchanged via __VA_ARGS__.
 */
28 #define DECL_ENTRY(TYPE, NAME, ...) \
29  static const SwsOpEntry op_##NAME = { \
30  .type = SWS_PIXEL_##TYPE, \
31  __VA_ARGS__ \
32  }
33 
/*
 * Like DECL_ENTRY, but additionally declares the external symbol ff_<NAME>
 * and installs it as the entry's .func.
 */
34 #define DECL_ASM(TYPE, NAME, ...) \
35  void ff_##NAME(void); \
36  DECL_ENTRY(TYPE, NAME, \
37  .func = ff_##NAME, \
38  __VA_ARGS__)
39 
/*
 * Declares an entry named p<X><Y><Z><W>_<NAME>. X, Y, Z and W are 0/1 flags;
 * each slot of .unused receives the logical negation of the matching flag.
 */
40 #define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
41  DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
42  .unused = { !X, !Y, !Z, !W }, \
43  __VA_ARGS__ \
44  )
45 
/* Address of the entry declared by DECL_PATTERN with the same arguments. */
46 #define REF_PATTERN(NAME, X, Y, Z, W) \
47  &op_p##X##Y##Z##W##_##NAME
48 
/*
 * Declares NAME for the four patterns 1000, 1001, 1110 and 1111. The last
 * declaration carries no semicolon; the invocation site supplies it.
 */
49 #define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
50  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
51  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
52  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
53  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
54 
/*
 * Comma-separated references to the four entries produced by
 * DECL_COMMON_PATTERNS, for use inside an entries[] initializer.
 */
55 #define REF_COMMON_PATTERNS(NAME) \
56  REF_PATTERN(NAME, 1, 0, 0, 0), \
57  REF_PATTERN(NAME, 1, 0, 0, 1), \
58  REF_PATTERN(NAME, 1, 1, 1, 0), \
59  REF_PATTERN(NAME, 1, 1, 1, 1)
60 
/*
 * Declares a read/write entry named <NAME><ELEMS><EXT> for SWS_OP_<OP>,
 * recording the element count, the packed flag and the frac value in .rw.
 */
61 #define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
62  DECL_ASM(TYPE, NAME##ELEMS##EXT, \
63  .op = SWS_OP_##OP, \
64  .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
65  );
66 
/*
 * Declares the packed read and write entries with 2, 3 and 4 elements for
 * the pixel type U<DEPTH>.
 */
67 #define DECL_PACKED_RW(EXT, DEPTH) \
68  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
69  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
70  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
71  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
72  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
73  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
74 
/*
 * Declares a pack_<XYZW><EXT> / unpack_<XYZW><EXT> entry pair that share the
 * same .pack.pattern of {X, Y, Z, W}.
 */
75 #define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
76  DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \
77  .op = SWS_OP_PACK, \
78  .pack.pattern = {X, Y, Z, W}, \
79  ); \
80  \
81  DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \
82  .op = SWS_OP_UNPACK, \
83  .pack.pattern = {X, Y, Z, W}, \
84  ); \
85 
86 static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
87 {
88  const int mask = ff_sws_pixel_type_size(op->type) - 1;
89  for (int i = 0; i < 16; i++)
90  out->u8[i] = (i & ~mask) | (mask - (i & mask));
91  return 0;
92 }
93 
/*
 * Declares a SWS_OP_SWAP_BYTES entry that uses the ff_p<XYZW>_shuffle<EXT>
 * kernel as its .func, with setup_swap_bytes providing the private data.
 * This goes through DECL_ENTRY, not DECL_ASM, so the shuffle symbol must
 * already be declared where the macro is expanded.
 */
94 #define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
95  DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
96  .op = SWS_OP_SWAP_BYTES, \
97  .unused = { !X, !Y, !Z, !W }, \
98  .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
99  .setup = setup_swap_bytes, \
100  );
101 
/*
 * Declares a U8 SWS_OP_CLEAR entry with .clear_value = -1 and component IDX
 * marked as unused.
 */
102 #define DECL_CLEAR_ALPHA(EXT, IDX) \
103  DECL_ASM(U8, clear_alpha##IDX##EXT, \
104  .op = SWS_OP_CLEAR, \
105  .clear_value = -1, \
106  .unused[IDX] = true, \
107  ); \
108 
/*
 * Declares a U8 SWS_OP_CLEAR entry with .clear_value = 0 and component IDX
 * marked as unused.
 */
109 #define DECL_CLEAR_ZERO(EXT, IDX) \
110  DECL_ASM(U8, clear_zero##IDX##EXT, \
111  .op = SWS_OP_CLEAR, \
112  .clear_value = 0, \
113  .unused[IDX] = true, \
114  );
115 
116 static int setup_clear(const SwsOp *op, SwsOpPriv *out)
117 {
118  for (int i = 0; i < 4; i++)
119  out->u32[i] = (uint32_t) op->c.q4[i].num;
120  return 0;
121 }
122 
/*
 * Declares a flexible U8 SWS_OP_CLEAR entry for one component pattern;
 * setup_clear fills its private data.
 */
123 #define DECL_CLEAR(EXT, X, Y, Z, W) \
124  DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
125  .op = SWS_OP_CLEAR, \
126  .setup = setup_clear, \
127  .flexible = true, \
128  );
129 
/* Declares a U8 SWS_OP_SWIZZLE entry with .swizzle.in = {X, Y, Z, W}. */
130 #define DECL_SWIZZLE(EXT, X, Y, Z, W) \
131  DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
132  .op = SWS_OP_SWIZZLE, \
133  .swizzle.in = {X, Y, Z, W}, \
134  );
135 
/*
 * Declares SWS_OP_CONVERT entries (all common patterns) whose entry type is
 * FROM and whose .convert.to is SWS_PIXEL_<TO>.
 */
136 #define DECL_CONVERT(EXT, FROM, TO) \
137  DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
138  .op = SWS_OP_CONVERT, \
139  .convert.to = SWS_PIXEL_##TO, \
140  );
141 
/* Same as DECL_CONVERT, but with .convert.expand set to true. */
142 #define DECL_EXPAND(EXT, FROM, TO) \
143  DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
144  .op = SWS_OP_CONVERT, \
145  .convert.to = SWS_PIXEL_##TO, \
146  .convert.expand = true, \
147  );
148 
149 static int setup_shift(const SwsOp *op, SwsOpPriv *out)
150 {
151  out->u16[0] = op->c.u;
152  return 0;
153 }
154 
/*
 * Declares the flexible U16 left-shift and right-shift entries (all common
 * patterns); setup_shift provides their private data.
 */
155 #define DECL_SHIFT16(EXT) \
156  DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
157  .op = SWS_OP_LSHIFT, \
158  .setup = setup_shift, \
159  .flexible = true, \
160  ); \
161  \
162  DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
163  .op = SWS_OP_RSHIFT, \
164  .setup = setup_shift, \
165  .flexible = true, \
166  );
167 
/*
 * Declares the flexible F32 min and max entries (all common patterns), both
 * set up through ff_sws_setup_q4.
 */
168 #define DECL_MIN_MAX(EXT) \
169  DECL_COMMON_PATTERNS(F32, min##EXT, \
170  .op = SWS_OP_MIN, \
171  .setup = ff_sws_setup_q4, \
172  .flexible = true, \
173  ); \
174  \
175  DECL_COMMON_PATTERNS(F32, max##EXT, \
176  .op = SWS_OP_MAX, \
177  .setup = ff_sws_setup_q4, \
178  .flexible = true, \
179  );
180 
/*
 * Declares the F32 scale entries (all common patterns), set up through
 * ff_sws_setup_q.
 */
181 #define DECL_SCALE(EXT) \
182  DECL_COMMON_PATTERNS(F32, scale##EXT, \
183  .op = SWS_OP_SCALE, \
184  .setup = ff_sws_setup_q, \
185  );
186 
187 /* 2x2 matrix fits inside SwsOpPriv directly; save an indirect in this case */
188 static_assert(sizeof(SwsOpPriv) >= sizeof(float[2][2]), "2x2 dither matrix too large");
189 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
190 {
191  const int size = 1 << op->dither.size_log2;
192  float *matrix = out->f32;
193  if (size > 2) {
194  matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
195  if (!matrix)
196  return AVERROR(ENOMEM);
197  }
198 
199  for (int i = 0; i < size * size; i++)
200  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
201 
202  return 0;
203 }
204 
/*
 * Declares the SWS_OP_DITHER entries (all common patterns) for one matrix
 * size.
 *
 * SIZE is stored in .dither_size and is instantiated for 0..8, i.e. it is
 * the log2 of the matrix dimension (NOTE(review): the matching code is not
 * in this file, confirm). setup_dither() heap-allocates the matrix whenever
 * the dimension (1 << size_log2) exceeds 2, so the destructor must be
 * installed under exactly that condition; otherwise the 4x4 case (SIZE == 2)
 * would never release its buffer.
 */
#define DECL_DITHER(EXT, SIZE) \
    DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT, \
        .op = SWS_OP_DITHER, \
        .setup = setup_dither, \
        .free = (1 << (SIZE)) > 2 ? av_free : NULL, \
        .dither_size = SIZE, \
    );
212 
213 static int setup_linear(const SwsOp *op, SwsOpPriv *out)
214 {
215  float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
216  if (!matrix)
217  return AVERROR(ENOMEM);
218 
219  for (int y = 0; y < 4; y++) {
220  for (int x = 0; x < 5; x++)
221  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
222  }
223 
224  return 0;
225 }
226 
/*
 * Declares an F32 SWS_OP_LINEAR entry named <NAME><EXT> for the coefficient
 * mask MASK. setup_linear allocates the private matrix and av_free releases
 * it again.
 */
227 #define DECL_LINEAR(EXT, NAME, MASK) \
228  DECL_ASM(F32, NAME##EXT, \
229  .op = SWS_OP_LINEAR, \
230  .setup = setup_linear, \
231  .free = av_free, \
232  .linear_mask = (MASK), \
233  );
234 
/*
 * Declares every 8-bit op entry for one (block size, symbol suffix, CPU flag)
 * combination, forward-declares the four ff_p*_shuffle<EXT> kernels, and
 * collects the entries in the NULL-terminated table ops8<EXT>.
 *
 * Each entry declared in the first half must also be listed in .entries,
 * otherwise it is never looked up.
 */
235 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
236  DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
237  DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
238  DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
239  DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
240  DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
241  DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
242  DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
243  DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
244  DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
245  DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
246  DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
247  DECL_PACKED_RW(EXT, 8) \
248  DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
249  DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
250  DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
251  void ff_p1000_shuffle##EXT(void); \
252  void ff_p1001_shuffle##EXT(void); \
253  void ff_p1110_shuffle##EXT(void); \
254  void ff_p1111_shuffle##EXT(void); \
255  DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
256  DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
257  DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
258  DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
259  DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
260  DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
261  DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
262  DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
263  DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
264  DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
265  DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
266  DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
267  DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
268  DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
269  DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
270  DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
271  DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
272  DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
273  DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
274  DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
275  DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
276  DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
277  DECL_CLEAR_ALPHA(EXT, 0) \
278  DECL_CLEAR_ALPHA(EXT, 1) \
279  DECL_CLEAR_ALPHA(EXT, 3) \
280  DECL_CLEAR_ZERO(EXT, 0) \
281  DECL_CLEAR_ZERO(EXT, 1) \
282  DECL_CLEAR_ZERO(EXT, 3) \
283  DECL_CLEAR(EXT, 1, 1, 1, 0) \
284  DECL_CLEAR(EXT, 0, 1, 1, 1) \
285  DECL_CLEAR(EXT, 0, 0, 1, 1) \
286  DECL_CLEAR(EXT, 1, 0, 0, 1) \
287  DECL_CLEAR(EXT, 1, 1, 0, 0) \
288  DECL_CLEAR(EXT, 0, 1, 0, 1) \
289  DECL_CLEAR(EXT, 1, 0, 1, 0) \
290  DECL_CLEAR(EXT, 1, 0, 0, 0) \
291  DECL_CLEAR(EXT, 0, 1, 0, 0) \
292  DECL_CLEAR(EXT, 0, 0, 1, 0) \
293  \
294 static const SwsOpTable ops8##EXT = { \
295  .cpu_flags = AV_CPU_FLAG_##FLAG, \
296  .block_size = SIZE, \
297  .entries = { \
298  &op_read_planar1##EXT, \
299  &op_read_planar2##EXT, \
300  &op_read_planar3##EXT, \
301  &op_read_planar4##EXT, \
302  &op_write_planar1##EXT, \
303  &op_write_planar2##EXT, \
304  &op_write_planar3##EXT, \
305  &op_write_planar4##EXT, \
306  &op_read8_packed2##EXT, \
307  &op_read8_packed3##EXT, \
308  &op_read8_packed4##EXT, \
309  &op_write8_packed2##EXT, \
310  &op_write8_packed3##EXT, \
311  &op_write8_packed4##EXT, \
312  &op_read_nibbles1##EXT, \
313  &op_read_bits1##EXT, \
314  &op_write_bits1##EXT, \
315  &op_pack_1210##EXT, \
316  &op_pack_3320##EXT, \
317  &op_pack_2330##EXT, \
318  &op_unpack_1210##EXT, \
319  &op_unpack_3320##EXT, \
320  &op_unpack_2330##EXT, \
321  &op_swizzle_3012##EXT, \
322  &op_swizzle_3021##EXT, \
323  &op_swizzle_2103##EXT, \
324  &op_swizzle_3210##EXT, \
325  &op_swizzle_3102##EXT, \
326  &op_swizzle_3201##EXT, \
327  &op_swizzle_1203##EXT, \
328  &op_swizzle_1023##EXT, \
329  &op_swizzle_2013##EXT, \
330  &op_swizzle_2310##EXT, \
331  &op_swizzle_2130##EXT, \
332  &op_swizzle_1230##EXT, \
333  &op_swizzle_1320##EXT, \
334  &op_swizzle_0213##EXT, \
335  &op_swizzle_0231##EXT, \
336  &op_swizzle_0312##EXT, \
337  &op_swizzle_3120##EXT, \
338  &op_swizzle_0321##EXT, \
339  &op_swizzle_0003##EXT, \
340  &op_swizzle_0001##EXT, \
341  &op_swizzle_3000##EXT, \
342  &op_swizzle_1000##EXT, \
343  &op_clear_alpha0##EXT, \
344  &op_clear_alpha1##EXT, \
345  &op_clear_alpha3##EXT, \
346  &op_clear_zero0##EXT, \
347  &op_clear_zero1##EXT, \
348  &op_clear_zero3##EXT, \
349  REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
350  REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
351  REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
352  REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
353  REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
354  REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
355  REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
356  REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
357  REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
358  REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
359  NULL \
360  }, \
361 };
362 
/*
 * Declares the 16-bit op entries for one (block size, symbol suffix, CPU
 * flag) combination and collects them in the NULL-terminated table
 * ops16<EXT>.
 *
 * The swap-bytes entries reference ff_p*_shuffle<EXT>, which this macro does
 * not declare itself; DECL_FUNCS_8 declares those symbols, so it has to be
 * expanded for the same EXT beforehand.
 */
363 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
364  DECL_PACKED_RW(EXT, 16) \
365  DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
366  DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
367  DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
368  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
369  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
370  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
371  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
372  DECL_SHIFT16(EXT) \
373  DECL_CONVERT(EXT, U8, U16) \
374  DECL_CONVERT(EXT, U16, U8) \
375  DECL_EXPAND(EXT, U8, U16) \
376  \
377 static const SwsOpTable ops16##EXT = { \
378  .cpu_flags = AV_CPU_FLAG_##FLAG, \
379  .block_size = SIZE, \
380  .entries = { \
381  &op_read16_packed2##EXT, \
382  &op_read16_packed3##EXT, \
383  &op_read16_packed4##EXT, \
384  &op_write16_packed2##EXT, \
385  &op_write16_packed3##EXT, \
386  &op_write16_packed4##EXT, \
387  &op_pack_4440##EXT, \
388  &op_pack_5550##EXT, \
389  &op_pack_5650##EXT, \
390  &op_unpack_4440##EXT, \
391  &op_unpack_5550##EXT, \
392  &op_unpack_5650##EXT, \
393  REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
394  REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
395  REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
396  REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
397  REF_COMMON_PATTERNS(lshift16##EXT), \
398  REF_COMMON_PATTERNS(rshift16##EXT), \
399  NULL \
400  }, \
401 };
402 
/*
 * Declares the 32-bit (U32 and F32) op entries for one (block size, symbol
 * suffix, CPU flag) combination and collects them in the NULL-terminated
 * table ops32<EXT>.
 *
 * Packed read/write, pack/unpack and swap-bytes entries are declared with the
 * suffix _m2<EXT>; every other entry uses <EXT> directly. The swap-bytes
 * entries rely on ff_p*_shuffle_m2<EXT> being declared elsewhere (DECL_FUNCS_8
 * does so when expanded with that suffix).
 */
403 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
404  DECL_PACKED_RW(_m2##EXT, 32) \
405  DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
406  DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
407  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
408  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
409  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
410  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
411  DECL_CONVERT(EXT, U8, U32) \
412  DECL_CONVERT(EXT, U32, U8) \
413  DECL_CONVERT(EXT, U16, U32) \
414  DECL_CONVERT(EXT, U32, U16) \
415  DECL_CONVERT(EXT, U8, F32) \
416  DECL_CONVERT(EXT, F32, U8) \
417  DECL_CONVERT(EXT, U16, F32) \
418  DECL_CONVERT(EXT, F32, U16) \
419  DECL_EXPAND(EXT, U8, U32) \
420  DECL_MIN_MAX(EXT) \
421  DECL_SCALE(EXT) \
422  DECL_DITHER(EXT, 0) \
423  DECL_DITHER(EXT, 1) \
424  DECL_DITHER(EXT, 2) \
425  DECL_DITHER(EXT, 3) \
426  DECL_DITHER(EXT, 4) \
427  DECL_DITHER(EXT, 5) \
428  DECL_DITHER(EXT, 6) \
429  DECL_DITHER(EXT, 7) \
430  DECL_DITHER(EXT, 8) \
431  DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
432  DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
433  DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
434  DECL_LINEAR(EXT, dot3, 0x7) \
435  DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
436  DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
437  DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
438  DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
439  DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
440  DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
441  DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
442  DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
443  DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
444  DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
445  \
446 static const SwsOpTable ops32##EXT = { \
447  .cpu_flags = AV_CPU_FLAG_##FLAG, \
448  .block_size = SIZE, \
449  .entries = { \
450  &op_read32_packed2_m2##EXT, \
451  &op_read32_packed3_m2##EXT, \
452  &op_read32_packed4_m2##EXT, \
453  &op_write32_packed2_m2##EXT, \
454  &op_write32_packed3_m2##EXT, \
455  &op_write32_packed4_m2##EXT, \
456  &op_pack_1010102_m2##EXT, \
457  &op_pack_2101010_m2##EXT, \
458  &op_unpack_1010102_m2##EXT, \
459  &op_unpack_2101010_m2##EXT, \
460  REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
461  REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
462  REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
463  REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
464  REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
465  REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
466  REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
467  REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
468  REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
469  REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
470  REF_COMMON_PATTERNS(min##EXT), \
471  REF_COMMON_PATTERNS(max##EXT), \
472  REF_COMMON_PATTERNS(scale##EXT), \
473  REF_COMMON_PATTERNS(dither0##EXT), \
474  REF_COMMON_PATTERNS(dither1##EXT), \
475  REF_COMMON_PATTERNS(dither2##EXT), \
476  REF_COMMON_PATTERNS(dither3##EXT), \
477  REF_COMMON_PATTERNS(dither4##EXT), \
478  REF_COMMON_PATTERNS(dither5##EXT), \
479  REF_COMMON_PATTERNS(dither6##EXT), \
480  REF_COMMON_PATTERNS(dither7##EXT), \
481  REF_COMMON_PATTERNS(dither8##EXT), \
482  &op_luma##EXT, \
483  &op_alpha##EXT, \
484  &op_lumalpha##EXT, \
485  &op_dot3##EXT, \
486  &op_row0##EXT, \
487  &op_row0a##EXT, \
488  &op_diag3##EXT, \
489  &op_diag4##EXT, \
490  &op_diagoff3##EXT, \
491  &op_matrix3##EXT, \
492  &op_affine3##EXT, \
493  &op_affine3a##EXT, \
494  &op_matrix4##EXT, \
495  &op_affine4##EXT, \
496  NULL \
497  }, \
498 };
499 
/*
 * Table instantiations. The arguments are (block size, symbol suffix, CPU
 * flag): the first becomes .block_size, the last .cpu_flags of the table.
 * The 8-bit tables come first because their expansion declares the
 * ff_p*_shuffle symbols that the 16- and 32-bit swap-bytes entries reference.
 */
500 DECL_FUNCS_8(16, _m1_sse4, SSE4)
501 DECL_FUNCS_8(32, _m1_avx2, AVX2)
502 DECL_FUNCS_8(32, _m2_sse4, SSE4)
503 DECL_FUNCS_8(64, _m2_avx2, AVX2)
504 
505 DECL_FUNCS_16(16, _m1_avx2, AVX2)
506 DECL_FUNCS_16(32, _m2_avx2, AVX2)
507 
508 DECL_FUNCS_32(16, _avx2, AVX2)
509 
510 static av_const int get_mmsize(const int cpu_flags)
511 {
513  return 64;
514  else if (cpu_flags & AV_CPU_FLAG_AVX2)
515  return 32;
516  else if (cpu_flags & AV_CPU_FLAG_SSE4)
517  return 16;
518  else
519  return AVERROR(ENOTSUP);
520 }
521 
522 /**
523  * Returns true if the operation's implementation only depends on the block
524  * size, and not the underlying pixel type
525  */
526 static bool op_is_type_invariant(const SwsOp *op)
527 {
528  switch (op->op) {
529  case SWS_OP_READ:
530  case SWS_OP_WRITE:
531  return !op->rw.packed && !op->rw.frac;
532  case SWS_OP_SWIZZLE:
533  case SWS_OP_CLEAR:
534  return true;
535  }
536 
537  return false;
538 }
539 
540 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
541 {
542  uint8_t shuffle[16];
543  int read_bytes, write_bytes;
544  int pixels;
545 
546  /* Solve the shuffle mask for one 128-bit lane only */
547  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
548  if (pixels < 0)
549  return pixels;
550 
551  /* We can't shuffle acress lanes, so restrict the vector size to XMM
552  * whenever the read/write size would be a subset of the full vector */
553  if (read_bytes < 16 || write_bytes < 16)
554  mmsize = 16;
555 
556  const int num_lanes = mmsize / 16;
557  const int in_total = num_lanes * read_bytes;
558  const int out_total = num_lanes * write_bytes;
559  const int read_size = in_total <= 4 ? 4 : /* movd */
560  in_total <= 8 ? 8 : /* movq */
561  mmsize; /* movu */
562 
563  *out = (SwsCompiledOp) {
564  .priv = av_memdup(shuffle, sizeof(shuffle)),
565  .free = av_free,
566  .block_size = pixels * num_lanes,
567  .over_read = read_size - in_total,
568  .over_write = mmsize - out_total,
569  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
570  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
572  };
573 
574  if (!out->priv)
575  return AVERROR(ENOMEM);
576 
577 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
578 do { \
579  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
580  if (in_total == IN && out_total == OUT) \
581  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
582 } while (0)
583 
584  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
585  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
586  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
587  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
588  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
589  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
590  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
591  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
592  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
593  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
594  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
595  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
596  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
597  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
598  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
599  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
600  av_assert1(out->func);
601  return 0;
602 }
603 
604 /* Normalize clear values into 32-bit integer constants */
605 static void normalize_clear(SwsOp *op)
606 {
607  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
608  SwsOpPriv priv;
609  union {
610  uint32_t u32;
611  int i;
612  } c;
613 
614  ff_sws_setup_q4(op, &priv);
615  for (int i = 0; i < 4; i++) {
616  if (!op->c.q4[i].den)
617  continue;
618  switch (ff_sws_pixel_type_size(op->type)) {
619  case 1: c.u32 = 0x1010101 * priv.u8[i]; break;
620  case 2: c.u32 = priv.u16[i] << 16 | priv.u16[i]; break;
621  case 4: c.u32 = priv.u32[i]; break;
622  }
623 
624  op->c.q4[i].num = c.i;
625  op->c.q4[i].den = 1;
626  }
627 }
628 
630 {
631  const int cpu_flags = av_get_cpu_flags();
632  const int mmsize = get_mmsize(cpu_flags);
633  if (mmsize < 0)
634  return mmsize;
635 
636  av_assert1(ops->num_ops > 0);
637  const SwsOp read = ops->ops[0];
638  const SwsOp write = ops->ops[ops->num_ops - 1];
639  int ret;
640 
641  /* Special fast path for in-place packed shuffle */
642  ret = solve_shuffle(ops, mmsize, out);
643  if (ret != AVERROR(ENOTSUP))
644  return ret;
645 
647  if (!chain)
648  return AVERROR(ENOMEM);
649 
650  *out = (SwsCompiledOp) {
651  .priv = chain,
652  .free = (void (*)(void *)) ff_sws_op_chain_free,
653 
654  /* Use at most two full YMM regs during the widest precision section */
655  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
656  };
657 
658  /* 3-component reads/writes process one extra garbage word */
659  if (read.rw.packed && read.rw.elems == 3)
660  out->over_read = sizeof(uint32_t);
661  if (write.rw.packed && write.rw.elems == 3)
662  out->over_write = sizeof(uint32_t);
663 
664  static const SwsOpTable *const tables[] = {
665  &ops8_m1_sse4,
666  &ops8_m1_avx2,
667  &ops8_m2_sse4,
668  &ops8_m2_avx2,
669  &ops16_m1_avx2,
670  &ops16_m2_avx2,
671  &ops32_avx2,
672  };
673 
674  do {
675  int op_block_size = out->block_size;
676  SwsOp *op = &ops->ops[0];
677 
678  if (op_is_type_invariant(op)) {
679  if (op->op == SWS_OP_CLEAR)
681  op_block_size *= ff_sws_pixel_type_size(op->type);
682  op->type = SWS_PIXEL_U8;
683  }
684 
686  op_block_size, chain);
687  } while (ret == AVERROR(EAGAIN));
688  if (ret < 0) {
689  ff_sws_op_chain_free(chain);
690  return ret;
691  }
692 
693 #define ASSIGN_PROCESS_FUNC(NAME) \
694  do { \
695  SWS_DECL_FUNC(NAME); \
696  void NAME##_return(void); \
697  ret = ff_sws_op_chain_append(chain, NAME##_return, \
698  NULL, (SwsOpPriv) {0}); \
699  out->func = NAME; \
700  } while (0)
701 
702  const int read_planes = read.rw.packed ? 1 : read.rw.elems;
703  const int write_planes = write.rw.packed ? 1 : write.rw.elems;
704  switch (FFMAX(read_planes, write_planes)) {
705  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
706  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
707  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
708  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
709  }
710 
711  if (ret < 0) {
712  ff_sws_op_chain_free(chain);
713  return ret;
714  }
715 
716  out->cpu_flags = chain->cpu_flags;
717  return 0;
718 }
719 
721  .name = "x86",
722  .compile = compile,
723 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:48
SwsOpTable
Definition: ops_chain.h:118
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:58
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:510
out
FILE * out
Definition: movenc.c:55
setup_linear
static int setup_linear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:213
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:55
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:317
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:720
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:184
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:605
av_const
#define av_const
Definition: attributes.h:84
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:403
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
SwsOpBackend::name
const char * name
Definition: ops_internal.h:104
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:64
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:85
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:49
SwsOpList::num_ops
int num_ops
Definition: ops.h:211
tables
Writing a table generator This documentation is preliminary Parts of the API are not good and should be changed Basic concepts A table generator consists of two *_tablegen c and *_tablegen h The h file will provide the variable declarations and initialization code for the tables
Definition: tablegen.txt:10
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:32
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:57
avassert.h
setup_dither
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:189
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ctx
AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:45
SwsOpBackend
Definition: ops_internal.h:103
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:80
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:195
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:540
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:53
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:47
size
int size
Definition: twinvq_data.h:10344
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:49
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:48
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:629
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(void *)
Definition: ops_chain.h:83
setup_swap_bytes
static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:86
SwsOpList::ops
SwsOp * ops
Definition: ops.h:210
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:526
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out)
Definition: ops_chain.c:278
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:57
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if availab...
Definition: mem.c:256
SwsOp
Definition: ops.h:179
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_internal.h:90
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
SwsReadWriteOp::packed
bool packed
Definition: ops.h:99
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:774
setup_shift
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:149
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:97
mem.h
setup_clear
static int setup_clear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:116
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
ff_sws_op_chain_free
void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.c:34
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:363
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:209
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:235
SwsContext
Main external API structure.
Definition: swscale.h:189
SwsOpPriv
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:42
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239