FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Declare a static SwsOpEntry for the given pixel TYPE, named op_##NAME,
 * with arbitrary extra designated initializers. */
#define DECL_ENTRY(TYPE, NAME, ...) \
    static const SwsOpEntry op_##NAME = { \
        .type = SWS_PIXEL_##TYPE, \
        __VA_ARGS__ \
    }

/* Declare an entry backed by an external assembly function ff_##NAME. */
#define DECL_ASM(TYPE, NAME, ...) \
    void ff_##NAME(void); \
    DECL_ENTRY(TYPE, NAME, \
        .func = ff_##NAME, \
        __VA_ARGS__)

/* Declare an ASM entry specialized for the component pattern X/Y/Z/W;
 * components given as 0 are flagged as unused. */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
        .unused = { !X, !Y, !Z, !W }, \
        __VA_ARGS__ \
    )

/* Reference a pattern entry previously declared with DECL_PATTERN. */
#define REF_PATTERN(NAME, X, Y, Z, W) \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four commonly used component patterns:
 * 1000, 1001, 1110 and 1111. */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \

/* Reference all four entries declared by DECL_COMMON_PATTERNS. */
#define REF_COMMON_PATTERNS(NAME) \
    REF_PATTERN(NAME, 1, 0, 0, 0), \
    REF_PATTERN(NAME, 1, 0, 0, 1), \
    REF_PATTERN(NAME, 1, 1, 1, 0), \
    REF_PATTERN(NAME, 1, 1, 1, 1)

/* Declare a read/write entry with ELEMS elements; PACKED selects packed vs
 * planar layout, FRAC is the fractional (sub-byte) step. */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
    DECL_ASM(TYPE, NAME##ELEMS##EXT, \
        .op = SWS_OP_##OP, \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
    );

/* Declare packed read/write entries for 2..4 components at a given DEPTH. */
#define DECL_PACKED_RW(EXT, DEPTH) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \

/* Declare a matching pack/unpack entry pair for the bit pattern X:Y:Z:W. */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_PACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \
    \
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_UNPACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \

86 static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
87 {
88  const int mask = ff_sws_pixel_type_size(op->type) - 1;
89  for (int i = 0; i < 16; i++)
90  out->u8[i] = (i & ~mask) | (mask - (i & mask));
91  return 0;
92 }
93 
/* Declare a byte-swap entry for pattern X/Y/Z/W, implemented on top of the
 * generic shuffle kernel with a mask generated by setup_swap_bytes(). */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
        .op = SWS_OP_SWAP_BYTES, \
        .unused = { !X, !Y, !Z, !W }, \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
        .setup = setup_swap_bytes, \
    );

/* Declare a clear entry setting component IDX to all-ones (alpha = max). */
#define DECL_CLEAR_ALPHA(EXT, IDX) \
    DECL_ASM(U8, clear_alpha##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear_value = -1, \
        .unused[IDX] = true, \
    ); \

/* Declare a clear entry setting component IDX to zero. */
#define DECL_CLEAR_ZERO(EXT, IDX) \
    DECL_ASM(U8, clear_zero##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear_value = 0, \
        .unused[IDX] = true, \
    );
115 
116 static int setup_clear(const SwsOp *op, SwsOpPriv *out)
117 {
118  for (int i = 0; i < 4; i++)
119  out->u32[i] = (uint32_t) op->c.q4[i].num;
120  return 0;
121 }
122 
/* Declare a generic clear entry for pattern X/Y/Z/W; the clear constants are
 * supplied at runtime via setup_clear(), hence .flexible. */
#define DECL_CLEAR(EXT, X, Y, Z, W) \
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
        .op = SWS_OP_CLEAR, \
        .setup = setup_clear, \
        .flexible = true, \
    );

/* Declare a swizzle entry for the input component order X, Y, Z, W. */
#define DECL_SWIZZLE(EXT, X, Y, Z, W) \
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
        .op = SWS_OP_SWIZZLE, \
        .swizzle.in = {X, Y, Z, W}, \
    );

/* Declare pixel type conversion entries FROM -> TO for the common patterns. */
#define DECL_CONVERT(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
    );

/* Same as DECL_CONVERT, but with bit-replicating expansion enabled. */
#define DECL_EXPAND(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
        .convert.expand = true, \
    );
148 
149 static int setup_shift(const SwsOp *op, SwsOpPriv *out)
150 {
151  out->u16[0] = op->c.u;
152  return 0;
153 }
154 
/* Declare left/right shift entries on 16-bit data; the shift amount is a
 * runtime constant installed by setup_shift(). */
#define DECL_SHIFT16(EXT) \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
        .op = SWS_OP_LSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
        .op = SWS_OP_RSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    );

/* Declare float min/max entries with per-component runtime constants. */
#define DECL_MIN_MAX(EXT) \
    DECL_COMMON_PATTERNS(F32, min##EXT, \
        .op = SWS_OP_MIN, \
        .setup = ff_sws_setup_q4, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(F32, max##EXT, \
        .op = SWS_OP_MAX, \
        .setup = ff_sws_setup_q4, \
        .flexible = true, \
    );

/* Declare a float scaling entry with a single runtime scale factor. */
#define DECL_SCALE(EXT) \
    DECL_COMMON_PATTERNS(F32, scale##EXT, \
        .op = SWS_OP_SCALE, \
        .setup = ff_sws_setup_q, \
        .flexible = true, \
    );

/* Declare a fixed-scale entry multiplying by (2^BITS - 1), used for
 * expanding single-bit/normalized data to full range. */
#define DECL_EXPAND_BITS(EXT, BITS) \
    DECL_ASM(U##BITS, expand_bits##BITS##EXT, \
        .op = SWS_OP_SCALE, \
        .scale = Q((1 << (BITS)) - 1), \
    );
193 
/**
 * Set up the constants for a dither operation.
 *
 * For a 1x1 matrix the single constant is stored inline in the private data.
 * Otherwise, a float conversion of the rational dither matrix is allocated
 * (owned by out->ptr, released through the entry's .free callback) and the
 * per-component row byte offsets are stored in out->i16[4..7].
 *
 * Returns 0 on success or AVERROR(ENOMEM) on allocation failure.
 */
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
{
    /* 1x1 matrix / single constant */
    if (!op->dither.size_log2) {
        const AVRational k = op->dither.matrix[0];
        out->f32[0] = (float) k.num / k.den;
        return 0;
    }

    const int size = 1 << op->dither.size_log2;
    const int8_t *off = op->dither.y_offset;
    int max_offset = 0;
    for (int i = 0; i < 4; i++) {
        /* Negative offsets are passed through untouched (see below); positive
         * offsets are wrapped to the matrix size before taking the maximum */
        if (off[i] >= 0)
            max_offset = FFMAX(max_offset, off[i] & (size - 1));
    }

    /* Allocate extra rows to allow over-reading for row offsets. Note that
     * max_offset is currently never larger than 5, so the extra space needed
     * for this over-allocation is bounded by 5 * size * sizeof(float),
     * typically 320 bytes for a 16x16 dither matrix. */
    const int stride = size * sizeof(float);
    const int num_rows = size + max_offset;
    float *matrix = out->ptr = av_mallocz(num_rows * stride);
    if (!matrix)
        return AVERROR(ENOMEM);

    /* Convert the rational matrix coefficients to floats */
    for (int i = 0; i < size * size; i++)
        matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;

    /* Replicate the first max_offset rows past the end of the matrix, so that
     * row-offset reads never need to wrap around */
    memcpy(&matrix[size * size], matrix, max_offset * stride);

    /* Store relative pointer offset to each row inside extra space */
    static_assert(sizeof(out->ptr) <= sizeof(int16_t[4]), ">8 byte pointers not supported");
    assert(max_offset * stride <= INT16_MAX);
    int16_t *off_out = &out->i16[4];
    for (int i = 0; i < 4; i++)
        off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;

    return 0;
}
235 
/* Declare a dither entry for a 2^SIZE x 2^SIZE matrix. SIZE 0 stores its
 * constant inline (no allocation), so only SIZE > 0 needs a .free callback;
 * DECL_MACRO selects between DECL_COMMON_PATTERNS and DECL_ASM. */
#define DECL_DITHER(DECL_MACRO, EXT, SIZE) \
    DECL_MACRO(F32, dither##SIZE##EXT, \
        .op = SWS_OP_DITHER, \
        .setup = setup_dither, \
        .free = (SIZE) ? av_free : NULL, \
        .dither_size = SIZE, \
    );
243 
244 static int setup_linear(const SwsOp *op, SwsOpPriv *out)
245 {
246  float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
247  if (!matrix)
248  return AVERROR(ENOMEM);
249 
250  for (int y = 0; y < 4; y++) {
251  for (int x = 0; x < 5; x++)
252  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
253  }
254 
255  return 0;
256 }
257 
/* Declare a linear transform entry restricted to the coefficient MASK of the
 * full 4x5 matrix; coefficients are installed at runtime by setup_linear(). */
#define DECL_LINEAR(EXT, NAME, MASK) \
    DECL_ASM(F32, NAME##EXT, \
        .op = SWS_OP_LINEAR, \
        .setup = setup_linear, \
        .free = av_free, \
        .linear_mask = (MASK), \
    );
265 
/* Declare all 8-bit op entries and the corresponding op table ops8##EXT, for
 * a given block SIZE (pixels per call) and required CPU FLAG. */
#define DECL_FUNCS_8(SIZE, EXT, FLAG) \
    DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
    DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
    DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
    DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
    DECL_EXPAND_BITS(EXT, 8) \
    DECL_PACKED_RW(EXT, 8) \
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
    void ff_p1000_shuffle##EXT(void); \
    void ff_p1001_shuffle##EXT(void); \
    void ff_p1110_shuffle##EXT(void); \
    void ff_p1111_shuffle##EXT(void); \
    DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
    DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
    DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
    DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
    DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
    DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
    DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
    DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
    DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
    DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
    DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
    DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
    DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
    DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
    DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
    DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
    DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
    DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
    DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
    DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
    DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
    DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
    DECL_CLEAR_ALPHA(EXT, 0) \
    DECL_CLEAR_ALPHA(EXT, 1) \
    DECL_CLEAR_ALPHA(EXT, 3) \
    DECL_CLEAR_ZERO(EXT, 0) \
    DECL_CLEAR_ZERO(EXT, 1) \
    DECL_CLEAR_ZERO(EXT, 3) \
    DECL_CLEAR(EXT, 1, 1, 1, 0) \
    DECL_CLEAR(EXT, 0, 1, 1, 1) \
    DECL_CLEAR(EXT, 0, 0, 1, 1) \
    DECL_CLEAR(EXT, 1, 0, 0, 1) \
    DECL_CLEAR(EXT, 1, 1, 0, 0) \
    DECL_CLEAR(EXT, 0, 1, 0, 1) \
    DECL_CLEAR(EXT, 1, 0, 1, 0) \
    DECL_CLEAR(EXT, 1, 0, 0, 0) \
    DECL_CLEAR(EXT, 0, 1, 0, 0) \
    DECL_CLEAR(EXT, 0, 0, 1, 0) \
    \
static const SwsOpTable ops8##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read_planar1##EXT, \
        &op_read_planar2##EXT, \
        &op_read_planar3##EXT, \
        &op_read_planar4##EXT, \
        &op_write_planar1##EXT, \
        &op_write_planar2##EXT, \
        &op_write_planar3##EXT, \
        &op_write_planar4##EXT, \
        &op_read8_packed2##EXT, \
        &op_read8_packed3##EXT, \
        &op_read8_packed4##EXT, \
        &op_write8_packed2##EXT, \
        &op_write8_packed3##EXT, \
        &op_write8_packed4##EXT, \
        &op_read_nibbles1##EXT, \
        &op_read_bits1##EXT, \
        &op_write_bits1##EXT, \
        &op_expand_bits8##EXT, \
        &op_pack_1210##EXT, \
        &op_pack_3320##EXT, \
        &op_pack_2330##EXT, \
        &op_unpack_1210##EXT, \
        &op_unpack_3320##EXT, \
        &op_unpack_2330##EXT, \
        &op_swizzle_3012##EXT, \
        &op_swizzle_3021##EXT, \
        &op_swizzle_2103##EXT, \
        &op_swizzle_3210##EXT, \
        &op_swizzle_3102##EXT, \
        &op_swizzle_3201##EXT, \
        &op_swizzle_1203##EXT, \
        &op_swizzle_1023##EXT, \
        &op_swizzle_2013##EXT, \
        &op_swizzle_2310##EXT, \
        &op_swizzle_2130##EXT, \
        &op_swizzle_1230##EXT, \
        &op_swizzle_1320##EXT, \
        &op_swizzle_0213##EXT, \
        &op_swizzle_0231##EXT, \
        &op_swizzle_0312##EXT, \
        &op_swizzle_3120##EXT, \
        &op_swizzle_0321##EXT, \
        &op_swizzle_0003##EXT, \
        &op_swizzle_0001##EXT, \
        &op_swizzle_3000##EXT, \
        &op_swizzle_1000##EXT, \
        &op_clear_alpha0##EXT, \
        &op_clear_alpha1##EXT, \
        &op_clear_alpha3##EXT, \
        &op_clear_zero0##EXT, \
        &op_clear_zero1##EXT, \
        &op_clear_zero3##EXT, \
        REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
        REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
        REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
        REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
        REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
        REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
        NULL \
    }, \
};
395 
/* Declare all 16-bit op entries and the corresponding op table ops16##EXT,
 * for a given block SIZE and required CPU FLAG. */
#define DECL_FUNCS_16(SIZE, EXT, FLAG) \
    DECL_PACKED_RW(EXT, 16) \
    DECL_EXPAND_BITS(EXT, 16) \
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
    DECL_SHIFT16(EXT) \
    DECL_CONVERT(EXT, U8, U16) \
    DECL_CONVERT(EXT, U16, U8) \
    DECL_EXPAND(EXT, U8, U16) \
    \
static const SwsOpTable ops16##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read16_packed2##EXT, \
        &op_read16_packed3##EXT, \
        &op_read16_packed4##EXT, \
        &op_write16_packed2##EXT, \
        &op_write16_packed3##EXT, \
        &op_write16_packed4##EXT, \
        &op_pack_4440##EXT, \
        &op_pack_5550##EXT, \
        &op_pack_5650##EXT, \
        &op_unpack_4440##EXT, \
        &op_unpack_5550##EXT, \
        &op_unpack_5650##EXT, \
        &op_expand_bits16##EXT, \
        REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
        REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
        REF_COMMON_PATTERNS(lshift16##EXT), \
        REF_COMMON_PATTERNS(rshift16##EXT), \
        NULL \
    }, \
};
437 
/* Declare all 32-bit (integer and float) op entries and the corresponding op
 * table ops32##EXT, for a given block SIZE and required CPU FLAG. Some
 * kernels are only provided in the _m2 variant. */
#define DECL_FUNCS_32(SIZE, EXT, FLAG) \
    DECL_PACKED_RW(_m2##EXT, 32) \
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
    DECL_CONVERT(EXT, U8, U32) \
    DECL_CONVERT(EXT, U32, U8) \
    DECL_CONVERT(EXT, U16, U32) \
    DECL_CONVERT(EXT, U32, U16) \
    DECL_CONVERT(EXT, U8, F32) \
    DECL_CONVERT(EXT, F32, U8) \
    DECL_CONVERT(EXT, U16, F32) \
    DECL_CONVERT(EXT, F32, U16) \
    DECL_EXPAND(EXT, U8, U32) \
    DECL_MIN_MAX(EXT) \
    DECL_SCALE(EXT) \
    DECL_DITHER(DECL_COMMON_PATTERNS, EXT, 0) \
    DECL_DITHER(DECL_ASM, EXT, 1) \
    DECL_DITHER(DECL_ASM, EXT, 2) \
    DECL_DITHER(DECL_ASM, EXT, 3) \
    DECL_DITHER(DECL_ASM, EXT, 4) \
    DECL_DITHER(DECL_ASM, EXT, 5) \
    DECL_DITHER(DECL_ASM, EXT, 6) \
    DECL_DITHER(DECL_ASM, EXT, 7) \
    DECL_DITHER(DECL_ASM, EXT, 8) \
    DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
    DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, dot3, 0x7) \
    DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
    DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
    DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
    DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
    DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
    DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
    DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
    DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
    \
static const SwsOpTable ops32##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read32_packed2_m2##EXT, \
        &op_read32_packed3_m2##EXT, \
        &op_read32_packed4_m2##EXT, \
        &op_write32_packed2_m2##EXT, \
        &op_write32_packed3_m2##EXT, \
        &op_write32_packed4_m2##EXT, \
        &op_pack_1010102_m2##EXT, \
        &op_pack_2101010_m2##EXT, \
        &op_unpack_1010102_m2##EXT, \
        &op_unpack_2101010_m2##EXT, \
        REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
        REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
        REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
        REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
        REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
        REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
        REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
        REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
        REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
        REF_COMMON_PATTERNS(min##EXT), \
        REF_COMMON_PATTERNS(max##EXT), \
        REF_COMMON_PATTERNS(scale##EXT), \
        REF_COMMON_PATTERNS(dither0##EXT), \
        &op_dither1##EXT, \
        &op_dither2##EXT, \
        &op_dither3##EXT, \
        &op_dither4##EXT, \
        &op_dither5##EXT, \
        &op_dither6##EXT, \
        &op_dither7##EXT, \
        &op_dither8##EXT, \
        &op_luma##EXT, \
        &op_alpha##EXT, \
        &op_lumalpha##EXT, \
        &op_dot3##EXT, \
        &op_row0##EXT, \
        &op_row0a##EXT, \
        &op_diag3##EXT, \
        &op_diag4##EXT, \
        &op_diagoff3##EXT, \
        &op_matrix3##EXT, \
        &op_affine3##EXT, \
        &op_affine3a##EXT, \
        &op_matrix4##EXT, \
        &op_affine4##EXT, \
        NULL \
    }, \
};
534 
/* Instantiate the op tables for each supported ISA level and block size.
 * NOTE(review): the _m1/_m2 suffixes appear to select single- vs
 * double-width kernel variants (the block size doubles accordingly) —
 * confirm against the corresponding ASM sources. */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
DECL_FUNCS_8(32, _m1_avx2, AVX2)
DECL_FUNCS_8(32, _m2_sse4, SSE4)
DECL_FUNCS_8(64, _m2_avx2, AVX2)

DECL_FUNCS_16(16, _m1_avx2, AVX2)
DECL_FUNCS_16(32, _m2_avx2, AVX2)

DECL_FUNCS_32(16, _avx2, AVX2)
544 
545 static av_const int get_mmsize(const int cpu_flags)
546 {
548  return 64;
549  else if (cpu_flags & AV_CPU_FLAG_AVX2)
550  return 32;
551  else if (cpu_flags & AV_CPU_FLAG_SSE4)
552  return 16;
553  else
554  return AVERROR(ENOTSUP);
555 }
556 
557 /**
558  * Returns true if the operation's implementation only depends on the block
559  * size, and not the underlying pixel type
560  */
561 static bool op_is_type_invariant(const SwsOp *op)
562 {
563  switch (op->op) {
564  case SWS_OP_READ:
565  case SWS_OP_WRITE:
566  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac;
567  case SWS_OP_SWIZZLE:
568  case SWS_OP_CLEAR:
569  return true;
570  }
571 
572  return false;
573 }
574 
575 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
576 {
577  uint8_t shuffle[16];
578  int read_bytes, write_bytes;
579  int pixels;
580 
581  /* Solve the shuffle mask for one 128-bit lane only */
582  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
583  if (pixels < 0)
584  return pixels;
585 
586  /* We can't shuffle across lanes, so restrict the vector size to XMM
587  * whenever the read/write size would be a subset of the full vector */
588  if (read_bytes < 16 || write_bytes < 16)
589  mmsize = 16;
590 
591  const int num_lanes = mmsize / 16;
592  const int in_total = num_lanes * read_bytes;
593  const int out_total = num_lanes * write_bytes;
594  const int read_size = in_total <= 4 ? 4 : /* movd */
595  in_total <= 8 ? 8 : /* movq */
596  mmsize; /* movu */
597 
598  *out = (SwsCompiledOp) {
599  .priv = av_memdup(shuffle, sizeof(shuffle)),
600  .free = av_free,
601  .slice_align = 1,
602  .block_size = pixels * num_lanes,
603  .over_read = read_size - in_total,
604  .over_write = mmsize - out_total,
605  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
606  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
608  };
609 
610  if (!out->priv)
611  return AVERROR(ENOMEM);
612 
613 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
614 do { \
615  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
616  if (in_total == IN && out_total == OUT) \
617  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
618 } while (0)
619 
620  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
621  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
622  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
623  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
624  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
625  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
626  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
627  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
628  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
629  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
630  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
631  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
632  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
633  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
634  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
635  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
636  av_assert1(out->func);
637  return 0;
638 }
639 
640 /* Normalize clear values into 32-bit integer constants */
641 static void normalize_clear(SwsOp *op)
642 {
643  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
644  SwsOpPriv priv;
645  union {
646  uint32_t u32;
647  int i;
648  } c;
649 
650  ff_sws_setup_q4(op, &priv);
651  for (int i = 0; i < 4; i++) {
652  if (!op->c.q4[i].den)
653  continue;
654  switch (ff_sws_pixel_type_size(op->type)) {
655  case 1: c.u32 = 0x1010101U * priv.u8[i]; break;
656  case 2: c.u32 = (uint32_t)priv.u16[i] << 16 | priv.u16[i]; break;
657  case 4: c.u32 = priv.u32[i]; break;
658  }
659 
660  op->c.q4[i].num = c.i;
661  op->c.q4[i].den = 1;
662  }
663 }
664 
666 {
667  const int cpu_flags = av_get_cpu_flags();
668  const int mmsize = get_mmsize(cpu_flags);
669  if (mmsize < 0)
670  return mmsize;
671 
672  av_assert1(ops->num_ops > 0);
673  const SwsOp *read = ops->ops[0].op == SWS_OP_READ ? &ops->ops[0] : NULL;
674  const SwsOp *write = &ops->ops[ops->num_ops - 1];
675  int ret;
676 
677  /* Special fast path for in-place packed shuffle */
678  ret = solve_shuffle(ops, mmsize, out);
679  if (ret != AVERROR(ENOTSUP))
680  return ret;
681 
683  if (!chain)
684  return AVERROR(ENOMEM);
685 
686  *out = (SwsCompiledOp) {
687  .priv = chain,
688  .slice_align = 1,
690 
691  /* Use at most two full YMM regs during the widest precision section */
692  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
693  };
694 
695  /* 3-component reads/writes process one extra garbage word */
696  if (read && read->rw.packed && read->rw.elems == 3)
697  out->over_read = sizeof(uint32_t);
698  if (write->rw.packed && write->rw.elems == 3)
699  out->over_write = sizeof(uint32_t);
700 
701  static const SwsOpTable *const tables[] = {
702  &ops8_m1_sse4,
703  &ops8_m1_avx2,
704  &ops8_m2_sse4,
705  &ops8_m2_avx2,
706  &ops16_m1_avx2,
707  &ops16_m2_avx2,
708  &ops32_avx2,
709  };
710 
711  do {
712  int op_block_size = out->block_size;
713  SwsOp *op = &ops->ops[0];
714 
715  if (op_is_type_invariant(op)) {
716  if (op->op == SWS_OP_CLEAR)
718  op_block_size *= ff_sws_pixel_type_size(op->type);
719  op->type = SWS_PIXEL_U8;
720  }
721 
723  op_block_size, chain);
724  } while (ret == AVERROR(EAGAIN));
725  if (ret < 0) {
726  ff_sws_op_chain_free(chain);
727  return ret;
728  }
729 
730 #define ASSIGN_PROCESS_FUNC(NAME) \
731  do { \
732  SWS_DECL_FUNC(NAME); \
733  void NAME##_return(void); \
734  ret = ff_sws_op_chain_append(chain, NAME##_return, \
735  NULL, &(SwsOpPriv) {0}); \
736  out->func = NAME; \
737  } while (0)
738 
739  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
740  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
741  switch (FFMAX(read_planes, write_planes)) {
742  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
743  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
744  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
745  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
746  }
747 
748  if (ret < 0) {
749  ff_sws_op_chain_free(chain);
750  return ret;
751  }
752 
753  out->cpu_flags = chain->cpu_flags;
754  return 0;
755 }
756 
758  .name = "x86",
759  .compile = compile,
760  .hw_format = AV_PIX_FMT_NONE,
761 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:47
SwsOpTable
Definition: ops_chain.h:125
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:50
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:545
out
static FILE * out
Definition: movenc.c:55
setup_linear
static int setup_linear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:244
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:59
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:610
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:757
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:191
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:641
av_const
#define av_const
Definition: attributes.h:100
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:438
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
SwsOpBackend::name
const char * name
Definition: ops_internal.h:114
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:69
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:87
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:51
SwsOpList::num_ops
int num_ops
Definition: ops.h:224
tables
Writing a table generator This documentation is preliminary Parts of the API are not good and should be changed Basic concepts A table generator consists of two *_tablegen c and *_tablegen h The h file will provide the variable declarations and initialization code for the tables
Definition: tablegen.txt:10
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:32
AVRational::num
int num
Numerator.
Definition: rational.h:59
SwsOp::op
SwsOpType op
Definition: ops.h:187
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
avassert.h
setup_dither
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:194
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:113
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:82
NULL
#define NULL
Definition: coverity.c:32
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:196
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:575
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:48
size
int size
Definition: twinvq_data.h:10344
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:48
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:49
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:665
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(void *)
Definition: ops_chain.h:85
setup_swap_bytes
static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:86
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:92
SwsOpList::ops
SwsOp * ops
Definition: ops.h:223
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:561
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out)
Definition: ops_chain.c:279
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:186
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_internal.h:99
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:101
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:686
setup_shift
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:149
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:99
mem.h
setup_clear
static int setup_clear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:116
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:396
stride
#define stride
Definition: h264pred_template.c:536
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:222
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:266
SwsContext
Main external API structure.
Definition: swscale.h:191
SwsOpPriv
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:42
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239