FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Declare a static constant SwsOpEntry with the given pixel TYPE and any
 * extra designated initializers */
#define DECL_ENTRY(TYPE, NAME, ...) \
    static const SwsOpEntry op_##NAME = { \
        .type = SWS_PIXEL_##TYPE, \
        __VA_ARGS__ \
    }

/* Declare an entry backed by an external assembly implementation ff_##NAME */
#define DECL_ASM(TYPE, NAME, ...) \
    void ff_##NAME(void); \
    DECL_ENTRY(TYPE, NAME, \
        .func = ff_##NAME, \
        __VA_ARGS__)

/* Declare an asm entry specialized for a component-usage pattern pXYZW;
 * a 0 in X/Y/Z/W marks the corresponding component as unused */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
        .unused = { !X, !Y, !Z, !W }, \
        __VA_ARGS__ \
    )

/* Reference a pattern-specialized entry declared by DECL_PATTERN */
#define REF_PATTERN(NAME, X, Y, Z, W) \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four common component patterns: X000, X00W, XYZ0, XYZW */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \

/* Reference all four entries declared by DECL_COMMON_PATTERNS */
#define REF_COMMON_PATTERNS(NAME) \
    REF_PATTERN(NAME, 1, 0, 0, 0), \
    REF_PATTERN(NAME, 1, 0, 0, 1), \
    REF_PATTERN(NAME, 1, 1, 1, 0), \
    REF_PATTERN(NAME, 1, 1, 1, 1)
60 
61 static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
62 {
63  const SwsOp *op = params->op;
64 
65  /* 3-component reads/writes process one extra garbage word */
66  if (op->rw.packed && op->rw.elems == 3) {
67  switch (op->op) {
68  case SWS_OP_READ: out->over_read = sizeof(uint32_t); break;
69  case SWS_OP_WRITE: out->over_write = sizeof(uint32_t); break;
70  }
71  }
72 
73  return 0;
74 }
75 
/* Declare a read/write entry; OP is READ or WRITE, ELEMS the component count,
 * PACKED whether components are interleaved, FRAC a fractional-size shift
 * (non-zero for nibble/bit-packed formats, judging by the uses below) */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
    DECL_ASM(TYPE, NAME##ELEMS##EXT, \
        .op = SWS_OP_##OP, \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
        .setup = setup_rw, \
    );

/* Declare packed reads/writes of 2..4 interleaved components at DEPTH bits */
#define DECL_PACKED_RW(EXT, DEPTH) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \

/* Declare a matching pack/unpack entry pair for bitfield pattern X:Y:Z:W */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_PACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \
    \
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_UNPACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \

102 static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
103 {
104  const int mask = ff_sws_pixel_type_size(params->op->type) - 1;
105  for (int i = 0; i < 16; i++)
106  out->priv.u8[i] = (i & ~mask) | (mask - (i & mask));
107  return 0;
108 }
109 
/* Byte-order swap, implemented via the generic shuffle kernel with a mask
 * generated at setup time by setup_swap_bytes */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
        .op = SWS_OP_SWAP_BYTES, \
        .unused = { !X, !Y, !Z, !W }, \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
        .setup = setup_swap_bytes, \
    );

/* Clear component IDX to all-ones (-1), e.g. for forcing opaque alpha */
#define DECL_CLEAR_ALPHA(EXT, IDX) \
    DECL_ASM(U8, clear_alpha##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear_value = -1, \
        .unused[IDX] = true, \
    ); \

/* Clear component IDX to zero */
#define DECL_CLEAR_ZERO(EXT, IDX) \
    DECL_ASM(U8, clear_zero##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear_value = 0, \
        .unused[IDX] = true, \
    );
131 
132 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
133 {
134  const SwsOp *op = params->op;
135  for (int i = 0; i < 4; i++)
136  out->priv.u32[i] = (uint32_t) op->clear.value[i].num;
137  return 0;
138 }
139 
/* Generic clear with runtime constants, specialized per component pattern */
#define DECL_CLEAR(EXT, X, Y, Z, W) \
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
        .op = SWS_OP_CLEAR, \
        .setup = setup_clear, \
        .flexible = true, \
    );

/* Component reordering with fixed input mapping {X, Y, Z, W} */
#define DECL_SWIZZLE(EXT, X, Y, Z, W) \
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
        .op = SWS_OP_SWIZZLE, \
        .swizzle.in = {X, Y, Z, W}, \
    );

/* Pixel type conversion FROM -> TO, for all common component patterns */
#define DECL_CONVERT(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
    );

/* Expanding conversion FROM -> TO (sets .convert.expand) */
#define DECL_EXPAND(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
        .convert.expand = true, \
    );
165 
166 static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
167 {
168  out->priv.u16[0] = params->op->shift.amount;
169  return 0;
170 }
171 
/* Left/right shifts on 16-bit pixels; amount supplied via setup_shift */
#define DECL_SHIFT16(EXT) \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
        .op = SWS_OP_LSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
        .op = SWS_OP_RSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    );

/* Float min/max clamping, with constants provided by ff_sws_setup_clamp */
#define DECL_MIN_MAX(EXT) \
    DECL_COMMON_PATTERNS(F32, min##EXT, \
        .op = SWS_OP_MIN, \
        .setup = ff_sws_setup_clamp, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(F32, max##EXT, \
        .op = SWS_OP_MAX, \
        .setup = ff_sws_setup_clamp, \
        .flexible = true, \
    );

/* Float scaling, with the factor provided by ff_sws_setup_scale */
#define DECL_SCALE(EXT) \
    DECL_COMMON_PATTERNS(F32, scale##EXT, \
        .op = SWS_OP_SCALE, \
        .setup = ff_sws_setup_scale, \
        .flexible = true, \
    );

/* Fixed integer scale by (2^BITS - 1) */
#define DECL_EXPAND_BITS(EXT, BITS) \
    DECL_ASM(U##BITS, expand_bits##BITS##EXT, \
        .op = SWS_OP_SCALE, \
        .scale = { .num = ((1 << (BITS)) - 1), .den = 1 }, \
    );
210 
211 static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
212 {
213  const SwsOp *op = params->op;
214  /* 1x1 matrix / single constant */
215  if (!op->dither.size_log2) {
216  const AVRational k = op->dither.matrix[0];
217  out->priv.f32[0] = (float) k.num / k.den;
218  return 0;
219  }
220 
221  const int size = 1 << op->dither.size_log2;
222  const int8_t *off = op->dither.y_offset;
223  int max_offset = 0;
224  for (int i = 0; i < 4; i++) {
225  if (off[i] >= 0)
226  max_offset = FFMAX(max_offset, off[i] & (size - 1));
227  }
228 
229  /* Allocate extra rows to allow over-reading for row offsets. Note that
230  * max_offset is currently never larger than 5, so the extra space needed
231  * for this over-allocation is bounded by 5 * size * sizeof(float),
232  * typically 320 bytes for a 16x16 dither matrix. */
233  const int stride = size * sizeof(float);
234  const int num_rows = size + max_offset;
235  float *matrix = out->priv.ptr = av_mallocz(num_rows * stride);
236  if (!matrix)
237  return AVERROR(ENOMEM);
238  out->free = ff_op_priv_free;
239 
240  for (int i = 0; i < size * size; i++)
241  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
242 
243  memcpy(&matrix[size * size], matrix, max_offset * stride);
244 
245  /* Store relative pointer offset to each row inside extra space */
246  static_assert(sizeof(out->priv.ptr) <= sizeof(int16_t[4]),
247  ">8 byte pointers not supported");
248  assert(max_offset * stride <= INT16_MAX);
249  int16_t *off_out = &out->priv.i16[4];
250  for (int i = 0; i < 4; i++)
251  off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;
252 
253  return 0;
254 }
255 
/* Dither with a (2^SIZE x 2^SIZE) matrix; SIZE == 0 is a single constant.
 * DECL_MACRO selects DECL_COMMON_PATTERNS or DECL_ASM at the use site. */
#define DECL_DITHER(DECL_MACRO, EXT, SIZE) \
    DECL_MACRO(F32, dither##SIZE##EXT, \
        .op = SWS_OP_DITHER, \
        .setup = setup_dither, \
        .dither_size = SIZE, \
    );
262 
263 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
264 {
265  const SwsOp *op = params->op;
266 
267  float *matrix = out->priv.ptr = av_mallocz(sizeof(float[4][5]));
268  if (!matrix)
269  return AVERROR(ENOMEM);
270  out->free = ff_op_priv_free;
271 
272  for (int y = 0; y < 4; y++) {
273  for (int x = 0; x < 5; x++)
274  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
275  }
276 
277  return 0;
278 }
279 
/* Linear (matrix) transform; MASK describes which coefficients are active */
#define DECL_LINEAR(EXT, NAME, MASK) \
    DECL_ASM(F32, NAME##EXT, \
        .op = SWS_OP_LINEAR, \
        .setup = setup_linear, \
        .linear_mask = (MASK), \
    );
286 
/**
 * Decide whether the FMA-based vertical filter may be used: always allowed
 * without SWS_BITEXACT; with SWS_BITEXACT, only for integer pixel types
 * whose scaled min/max partial sums fit losslessly inside a float.
 */
static bool check_filter_fma(const SwsImplParams *params)
{
    const SwsOp *op = params->op;
    SwsContext *ctx = params->ctx;
    if (!(ctx->flags & SWS_BITEXACT))
        return true;

    if (!ff_sws_pixel_type_is_int(op->type))
        return false;

    /* Check if maximum/minimum partial sum fits losslessly inside float */
    AVRational max_range = { 1 << 24, 1 };
    AVRational min_range = { -(1 << 24), 1 };
    /* NOTE(review): the declaration of `scale` (original source line 300)
     * was lost in this extraction; `scale` is used below. Restore it from
     * upstream before relying on this copy. */

    for (int i = 0; i < op->rw.elems; i++) {
        const AVRational min = av_mul_q(op->comps.min[i], scale);
        const AVRational max = av_mul_q(op->comps.max[i], scale);
        if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0)
            return false;
    }

    return true;
}
311 
312 static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
313 {
314  const SwsFilterWeights *filter = params->op->rw.kernel;
315  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
316  ">8 byte pointers not supported");
317 
318  /* Pre-convert weights to float */
319  float *weights = av_calloc(filter->num_weights, sizeof(float));
320  if (!weights)
321  return AVERROR(ENOMEM);
322 
323  for (int i = 0; i < filter->num_weights; i++)
324  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
325 
326  out->priv.ptr = weights;
327  out->priv.uptr[1] = filter->filter_size;
328  out->free = ff_op_priv_free;
329  return 0;
330 }
331 
332 static int hscale_sizeof_weight(const SwsOp *op)
333 {
334  switch (op->type) {
335  case SWS_PIXEL_U8: return sizeof(int16_t);
336  case SWS_PIXEL_U16: return sizeof(int16_t);
337  case SWS_PIXEL_F32: return sizeof(float);
338  default: return 0;
339  }
340 }
341 
/**
 * Setup for the gather-based horizontal filter: pre-transposes the filter
 * weights into the layout expected by the asm kernel (taps grouped by block,
 * with a special interleave for U8) and stashes them in out->priv.
 *
 * Returns 0 on success, AVERROR(EINVAL)/AVERROR(ENOMEM) on failure.
 */
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
{
    const SwsOp *op = params->op;
    const SwsFilterWeights *filter = op->rw.kernel;

    /**
     * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller
     * size, we need to gather 2/4 taps simultaneously and unroll the inner
     * loop over several packed samples.
     */
    const int taps_align = sizeof(int32_t) / ff_sws_pixel_type_size(op->type);
    const int filter_size = filter->filter_size;
    const int block_size = params->table->block_size;
    const size_t aligned_size = FFALIGN(filter_size, taps_align);
    const size_t line_size = FFALIGN(filter->dst_size, block_size);
    av_assert1(FFALIGN(line_size, taps_align) == line_size);
    if (aligned_size > INT_MAX)
        return AVERROR(EINVAL);

    union {
        void *ptr;
        int16_t *i16;
        float *f32;
    } weights;

    const int sizeof_weight = hscale_sizeof_weight(op);
    weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size);
    if (!weights.ptr)
        return AVERROR(ENOMEM);

    /**
     * Transpose filter weights to group (aligned) taps by block
     */
    const int mmsize = block_size * 2;
    const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */
    for (size_t x = 0; x < line_size; x += block_size) {
        const int elems = FFMIN(block_size, filter->dst_size - x);
        for (int j = 0; j < filter_size; j++) {
            const int jb = j & ~(taps_align - 1); /* base of this tap group */
            const int ji = j - jb;                /* tap index within group */
            const size_t idx_base = x * aligned_size + jb * block_size + ji;
            for (int i = 0; i < elems; i++) {
                const int w = filter->weights[(x + i) * filter_size + j];
                size_t idx = idx_base;
                if (op->type == SWS_PIXEL_U8) {
                    /* Interleave the pixels within each lane, i.e.:
                     * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0)
                     * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1)
                     * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0)
                     * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1)
                     * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0)
                     * ...
                     * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1)
                     * (repeat for taps 4-7, etc.)
                     */
                    const int gather_base = i & ~(gather_size - 1);
                    const int gather_pos = i - gather_base;
                    const int lane_idx = gather_pos >> 2;
                    const int pos_in_lane = gather_pos & 3;
                    idx += gather_base * 4                  /* which gather (m0 or m1) */
                         + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */
                         + lane_idx * 8                     /* 8 ints per lane */
                         + (pos_in_lane & 1) * 4;           /* 4 taps per pair */
                } else {
                    idx += i * taps_align;
                }

                switch (op->type) {
                case SWS_PIXEL_U8:  weights.i16[idx] = w; break;
                case SWS_PIXEL_U16: weights.i16[idx] = w; break;
                case SWS_PIXEL_F32: weights.f32[idx] = w; break;
                }
            }
        }
    }

    out->priv.ptr = weights.ptr;
    out->priv.uptr[1] = aligned_size;
    out->free = ff_op_priv_free;
    return 0;
}
423 
/**
 * Heuristic deciding whether to use the 4x4-transpose horizontal filter
 * kernel instead of the gather-based one. Rejected for F32 under
 * SWS_BITEXACT because the transpose changes accumulation order.
 */
static bool check_filter_4x4_h(const SwsImplParams *params)
{
    SwsContext *ctx = params->ctx;
    const SwsOp *op = params->op;
    if ((ctx->flags & SWS_BITEXACT) && op->type == SWS_PIXEL_F32)
        return false; /* different accumulation order due to 4x4 transpose */

    const int cpu_flags = av_get_cpu_flags();
    /* NOTE(review): a condition guarding the following `return true`
     * (original source line 432, presumably a slow-gather CPU flag check)
     * was lost in this extraction; as written, everything below the return
     * is unreachable. Restore from upstream. */
    return true; /* always prefer over gathers if gathers are slow */

    /**
     * Otherwise, prefer it above a certain filter size. Empirically, this
     * kernel seems to be faster whenever the reference/gather kernel crosses
     * a breakpoint for the number of gathers needed, but this filter doesn't.
     *
     * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
     */
    const SwsFilterWeights *filter = op->rw.kernel;
    return op->type == SWS_PIXEL_U8  && filter->filter_size > 12 ||
           op->type == SWS_PIXEL_U16 && filter->filter_size > 4  ||
           op->type == SWS_PIXEL_F32 && filter->filter_size > 1;
}
447 
449 {
450  const SwsOp *op = params->op;
451  const SwsFilterWeights *filter = op->rw.kernel;
452  const int sizeof_weights = hscale_sizeof_weight(op);
453  const int block_size = params->table->block_size;
454  const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
455  const int pixels_align = 4; /* pixels per iteration */
456  const int filter_size = filter->filter_size;
457  const size_t aligned_size = FFALIGN(filter_size, taps_align);
458  const int line_size = FFALIGN(filter->dst_size, block_size);
459  av_assert1(FFALIGN(line_size, pixels_align) == line_size);
460 
461  union {
462  void *ptr;
463  int16_t *i16;
464  float *f32;
465  } weights;
466 
467  weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
468  if (!weights.ptr)
469  return AVERROR(ENOMEM);
470 
471  /**
472  * Desired memory layout: [w][taps][pixels_align][taps_align]
473  *
474  * Example with taps_align=8, pixels_align=4:
475  * [a0, a1, ... a7] weights for pixel 0, taps 0..7
476  * [b0, b1, ... b7] weights for pixel 1, taps 0..7
477  * [c0, c1, ... c7] weights for pixel 2, taps 0..7
478  * [d0, d1, ... d7] weights for pixel 3, taps 0..7
479  * [a8, a9, ... a15] weights for pixel 0, taps 8..15
480  * ...
481  * repeat for all taps, then move on to pixels 4..7, etc.
482  */
483  for (int x = 0; x < filter->dst_size; x++) {
484  for (int j = 0; j < filter_size; j++) {
485  const int xb = x & ~(pixels_align - 1);
486  const int jb = j & ~(taps_align - 1);
487  const int xi = x - xb, ji = j - jb;
488  const int w = filter->weights[x * filter_size + j];
489  const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
490 
491  switch (op->type) {
492  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
493  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
494  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
495  }
496  }
497  }
498 
499  out->priv.ptr = weights.ptr;
500  out->priv.uptr[1] = aligned_size * sizeof_weights;
501  out->free = ff_op_priv_free;
502  return 0;
503 }
504 
/* A filtered read (convolution) of ELEMS components in direction DIR (H/V) */
#define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \
    DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT, \
        .op = SWS_OP_READ, \
        .rw.elems = ELEMS, \
        .rw.filter = SWS_OP_FILTER_##DIR, \
        __VA_ARGS__ \
    );

/* Declare the filter for 1..4 components */
#define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__)

/* All filter kernel variants for one pixel type: plain and FMA vertical,
 * gather-based and 4x4-transpose horizontal */
#define DECL_FILTERS_GENERIC(EXT, TYPE) \
    DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \
    DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \
                 .check = check_filter_fma) \
    DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) \
    DECL_FILTERS(EXT, TYPE, H, filter_4x4_h, .setup = setup_filter_4x4_h, \
                 .check = check_filter_4x4_h)

/* Reference the four component-count variants of a filter entry */
#define REF_FILTERS(NAME, SUFFIX) \
    &op_##NAME##1##SUFFIX, \
    &op_##NAME##2##SUFFIX, \
    &op_##NAME##3##SUFFIX, \
    &op_##NAME##4##SUFFIX
532 
/* Declare every 8-bit (byte-oriented) op implementation for a given block
 * SIZE and required CPU FLAG, plus the corresponding dispatch table
 * ops8##EXT listing all declared entries */
#define DECL_FUNCS_8(SIZE, EXT, FLAG) \
    DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
    DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
    DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
    DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
    DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
    DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
    DECL_EXPAND_BITS(EXT, 8) \
    DECL_PACKED_RW(EXT, 8) \
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
    void ff_p1000_shuffle##EXT(void); \
    void ff_p1001_shuffle##EXT(void); \
    void ff_p1110_shuffle##EXT(void); \
    void ff_p1111_shuffle##EXT(void); \
    DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
    DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
    DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
    DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
    DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
    DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
    DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
    DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
    DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
    DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
    DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
    DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
    DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
    DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
    DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
    DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
    DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
    DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
    DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
    DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
    DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
    DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
    DECL_CLEAR_ALPHA(EXT, 0) \
    DECL_CLEAR_ALPHA(EXT, 1) \
    DECL_CLEAR_ALPHA(EXT, 3) \
    DECL_CLEAR_ZERO(EXT, 0) \
    DECL_CLEAR_ZERO(EXT, 1) \
    DECL_CLEAR_ZERO(EXT, 3) \
    DECL_CLEAR(EXT, 1, 1, 1, 0) \
    DECL_CLEAR(EXT, 0, 1, 1, 1) \
    DECL_CLEAR(EXT, 0, 0, 1, 1) \
    DECL_CLEAR(EXT, 1, 0, 0, 1) \
    DECL_CLEAR(EXT, 1, 1, 0, 0) \
    DECL_CLEAR(EXT, 0, 1, 0, 1) \
    DECL_CLEAR(EXT, 1, 0, 1, 0) \
    DECL_CLEAR(EXT, 1, 0, 0, 0) \
    DECL_CLEAR(EXT, 0, 1, 0, 0) \
    DECL_CLEAR(EXT, 0, 0, 1, 0) \
    \
static const SwsOpTable ops8##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read_planar1##EXT, \
        &op_read_planar2##EXT, \
        &op_read_planar3##EXT, \
        &op_read_planar4##EXT, \
        &op_write_planar1##EXT, \
        &op_write_planar2##EXT, \
        &op_write_planar3##EXT, \
        &op_write_planar4##EXT, \
        &op_read8_packed2##EXT, \
        &op_read8_packed3##EXT, \
        &op_read8_packed4##EXT, \
        &op_write8_packed2##EXT, \
        &op_write8_packed3##EXT, \
        &op_write8_packed4##EXT, \
        &op_read_nibbles1##EXT, \
        &op_read_bits1##EXT, \
        &op_write_bits1##EXT, \
        &op_expand_bits8##EXT, \
        &op_pack_1210##EXT, \
        &op_pack_3320##EXT, \
        &op_pack_2330##EXT, \
        &op_unpack_1210##EXT, \
        &op_unpack_3320##EXT, \
        &op_unpack_2330##EXT, \
        &op_swizzle_3012##EXT, \
        &op_swizzle_3021##EXT, \
        &op_swizzle_2103##EXT, \
        &op_swizzle_3210##EXT, \
        &op_swizzle_3102##EXT, \
        &op_swizzle_3201##EXT, \
        &op_swizzle_1203##EXT, \
        &op_swizzle_1023##EXT, \
        &op_swizzle_2013##EXT, \
        &op_swizzle_2310##EXT, \
        &op_swizzle_2130##EXT, \
        &op_swizzle_1230##EXT, \
        &op_swizzle_1320##EXT, \
        &op_swizzle_0213##EXT, \
        &op_swizzle_0231##EXT, \
        &op_swizzle_0312##EXT, \
        &op_swizzle_3120##EXT, \
        &op_swizzle_0321##EXT, \
        &op_swizzle_0003##EXT, \
        &op_swizzle_0001##EXT, \
        &op_swizzle_3000##EXT, \
        &op_swizzle_1000##EXT, \
        &op_clear_alpha0##EXT, \
        &op_clear_alpha1##EXT, \
        &op_clear_alpha3##EXT, \
        &op_clear_zero0##EXT, \
        &op_clear_zero1##EXT, \
        &op_clear_zero3##EXT, \
        REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
        REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
        REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
        REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
        REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
        REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
        REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
        NULL \
    }, \
};
662 
/* Declare every 16-bit op implementation for a given block SIZE and CPU
 * FLAG, plus the corresponding dispatch table ops16##EXT */
#define DECL_FUNCS_16(SIZE, EXT, FLAG) \
    DECL_PACKED_RW(EXT, 16) \
    DECL_EXPAND_BITS(EXT, 16) \
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
    DECL_SHIFT16(EXT) \
    DECL_CONVERT(EXT, U8, U16) \
    DECL_CONVERT(EXT, U16, U8) \
    DECL_EXPAND(EXT, U8, U16) \
    \
static const SwsOpTable ops16##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read16_packed2##EXT, \
        &op_read16_packed3##EXT, \
        &op_read16_packed4##EXT, \
        &op_write16_packed2##EXT, \
        &op_write16_packed3##EXT, \
        &op_write16_packed4##EXT, \
        &op_pack_4440##EXT, \
        &op_pack_5550##EXT, \
        &op_pack_5650##EXT, \
        &op_unpack_4440##EXT, \
        &op_unpack_5550##EXT, \
        &op_unpack_5650##EXT, \
        &op_expand_bits16##EXT, \
        REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
        REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
        REF_COMMON_PATTERNS(lshift16##EXT), \
        REF_COMMON_PATTERNS(rshift16##EXT), \
        NULL \
    }, \
};
704 
/* Declare every 32-bit / float op implementation (conversions, clamps,
 * scaling, dither, linear transforms and all filter kernels) for a given
 * block SIZE and CPU FLAG, plus the dispatch table ops32##EXT.
 * The 32-bit packed/byte-swap entries use an _m2 suffix (wider variant). */
#define DECL_FUNCS_32(SIZE, EXT, FLAG) \
    DECL_PACKED_RW(_m2##EXT, 32) \
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
    DECL_CONVERT(EXT, U8, U32) \
    DECL_CONVERT(EXT, U32, U8) \
    DECL_CONVERT(EXT, U16, U32) \
    DECL_CONVERT(EXT, U32, U16) \
    DECL_CONVERT(EXT, U8, F32) \
    DECL_CONVERT(EXT, F32, U8) \
    DECL_CONVERT(EXT, U16, F32) \
    DECL_CONVERT(EXT, F32, U16) \
    DECL_EXPAND(EXT, U8, U32) \
    DECL_MIN_MAX(EXT) \
    DECL_SCALE(EXT) \
    DECL_DITHER(DECL_COMMON_PATTERNS, EXT, 0) \
    DECL_DITHER(DECL_ASM, EXT, 1) \
    DECL_DITHER(DECL_ASM, EXT, 2) \
    DECL_DITHER(DECL_ASM, EXT, 3) \
    DECL_DITHER(DECL_ASM, EXT, 4) \
    DECL_DITHER(DECL_ASM, EXT, 5) \
    DECL_DITHER(DECL_ASM, EXT, 6) \
    DECL_DITHER(DECL_ASM, EXT, 7) \
    DECL_DITHER(DECL_ASM, EXT, 8) \
    DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
    DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, dot3, 0x7) \
    DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
    DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
    DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
    DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
    DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
    DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
    DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
    DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
    DECL_FILTERS_GENERIC(EXT, U8) \
    DECL_FILTERS_GENERIC(EXT, U16) \
    DECL_FILTERS_GENERIC(EXT, F32) \
    \
static const SwsOpTable ops32##EXT = { \
    .cpu_flags = AV_CPU_FLAG_##FLAG, \
    .block_size = SIZE, \
    .entries = { \
        &op_read32_packed2_m2##EXT, \
        &op_read32_packed3_m2##EXT, \
        &op_read32_packed4_m2##EXT, \
        &op_write32_packed2_m2##EXT, \
        &op_write32_packed3_m2##EXT, \
        &op_write32_packed4_m2##EXT, \
        &op_pack_1010102_m2##EXT, \
        &op_pack_2101010_m2##EXT, \
        &op_unpack_1010102_m2##EXT, \
        &op_unpack_2101010_m2##EXT, \
        REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
        REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
        REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
        REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
        REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
        REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
        REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
        REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
        REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
        REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
        REF_COMMON_PATTERNS(min##EXT), \
        REF_COMMON_PATTERNS(max##EXT), \
        REF_COMMON_PATTERNS(scale##EXT), \
        REF_COMMON_PATTERNS(dither0##EXT), \
        &op_dither1##EXT, \
        &op_dither2##EXT, \
        &op_dither3##EXT, \
        &op_dither4##EXT, \
        &op_dither5##EXT, \
        &op_dither6##EXT, \
        &op_dither7##EXT, \
        &op_dither8##EXT, \
        &op_luma##EXT, \
        &op_alpha##EXT, \
        &op_lumalpha##EXT, \
        &op_dot3##EXT, \
        &op_row0##EXT, \
        &op_row0a##EXT, \
        &op_diag3##EXT, \
        &op_diag4##EXT, \
        &op_diagoff3##EXT, \
        &op_matrix3##EXT, \
        &op_affine3##EXT, \
        &op_affine3a##EXT, \
        &op_matrix4##EXT, \
        &op_affine4##EXT, \
        REF_FILTERS(filter_fma_v, _U8##EXT), \
        REF_FILTERS(filter_fma_v, _U16##EXT), \
        REF_FILTERS(filter_fma_v, _F32##EXT), \
        REF_FILTERS(filter_4x4_h, _U8##EXT), \
        REF_FILTERS(filter_4x4_h, _U16##EXT), \
        REF_FILTERS(filter_4x4_h, _F32##EXT), \
        REF_FILTERS(filter_v, _U8##EXT), \
        REF_FILTERS(filter_v, _U16##EXT), \
        REF_FILTERS(filter_v, _F32##EXT), \
        REF_FILTERS(filter_h, _U8##EXT), \
        REF_FILTERS(filter_h, _U16##EXT), \
        REF_FILTERS(filter_h, _F32##EXT), \
        NULL \
    }, \
};
816 
/* Instantiate all op tables. The first argument is the block size in pixels;
 * _m1/_m2 distinguish narrower/wider variants of the same ISA level —
 * NOTE(review): exact meaning of the m1/m2 suffix inferred from block sizes,
 * confirm against the asm sources. */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
DECL_FUNCS_8(32, _m1_avx2, AVX2)
DECL_FUNCS_8(32, _m2_sse4, SSE4)
DECL_FUNCS_8(64, _m2_avx2, AVX2)

DECL_FUNCS_16(16, _m1_avx2, AVX2)
DECL_FUNCS_16(32, _m2_avx2, AVX2)

DECL_FUNCS_32(16, _avx2, AVX2)

/* All available dispatch tables, consumed by the compile callback below */
static const SwsOpTable *const tables[] = {
    &ops8_m1_sse4,
    &ops8_m1_avx2,
    &ops8_m2_sse4,
    &ops8_m2_avx2,
    &ops16_m1_avx2,
    &ops16_m2_avx2,
    &ops32_avx2,
};
836 
837 static av_const int get_mmsize(const int cpu_flags)
838 {
840  return 64;
841  else if (cpu_flags & AV_CPU_FLAG_AVX2)
842  return 32;
843  else if (cpu_flags & AV_CPU_FLAG_SSE4)
844  return 16;
845  else
846  return AVERROR(ENOTSUP);
847 }
848 
849 /**
850  * Returns true if the operation's implementation only depends on the block
851  * size, and not the underlying pixel type
852  */
853 static bool op_is_type_invariant(const SwsOp *op)
854 {
855  switch (op->op) {
856  case SWS_OP_READ:
857  case SWS_OP_WRITE:
858  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac && !op->rw.filter;
859  case SWS_OP_SWIZZLE:
860  case SWS_OP_CLEAR:
861  return true;
862  }
863 
864  return false;
865 }
866 
867 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
868 {
869  uint8_t shuffle[16];
870  int read_bytes, write_bytes;
871  int pixels;
872 
873  /* Solve the shuffle mask for one 128-bit lane only */
874  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
875  if (pixels < 0)
876  return pixels;
877 
878  /* We can't shuffle acress lanes, so restrict the vector size to XMM
879  * whenever the read/write size would be a subset of the full vector */
880  if (read_bytes < 16 || write_bytes < 16)
881  mmsize = 16;
882 
883  const int num_lanes = mmsize / 16;
884  const int in_total = num_lanes * read_bytes;
885  const int out_total = num_lanes * write_bytes;
886  const int read_size = in_total <= 4 ? 4 : /* movd */
887  in_total <= 8 ? 8 : /* movq */
888  mmsize; /* movu */
889 
890  *out = (SwsCompiledOp) {
891  .priv = av_memdup(shuffle, sizeof(shuffle)),
892  .free = av_free,
893  .slice_align = 1,
894  .block_size = pixels * num_lanes,
895  .over_read = read_size - in_total,
896  .over_write = mmsize - out_total,
897  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
898  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
900  };
901 
902  if (!out->priv)
903  return AVERROR(ENOMEM);
904 
905 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
906 do { \
907  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
908  if (in_total == IN && out_total == OUT) \
909  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
910 } while (0)
911 
912  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
913  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
914  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
915  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
916  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
917  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
918  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
919  ASSIGN_SHUFFLE_FUNC(15, 5, sse4);
920  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
921  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
922  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
923  ASSIGN_SHUFFLE_FUNC(16, 4, sse4);
924  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
925  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
926  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
927  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
928  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
929  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
930  av_assert1(out->func);
931  return 0;
932 }
933 
934 /* Normalize clear values into 32-bit integer constants */
935 static void normalize_clear(SwsOp *op)
936 {
937  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
938  SwsImplResult res;
939  union {
940  uint32_t u32;
941  int i;
942  } c;
943 
944  ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, &res);
945 
946  for (int i = 0; i < 4; i++) {
947  if (!op->clear.value[i].den)
948  continue;
949  switch (ff_sws_pixel_type_size(op->type)) {
950  case 1: c.u32 = 0x1010101U * res.priv.u8[i]; break;
951  case 2: c.u32 = (uint32_t) res.priv.u16[i] << 16 | res.priv.u16[i]; break;
952  case 4: c.u32 = res.priv.u32[i]; break;
953  }
954 
955  op->clear.value[i].num = c.i;
956  op->clear.value[i].den = 1;
957  }
958 }
959 
961 {
962  int ret;
963  const int cpu_flags = av_get_cpu_flags();
964  const int mmsize = get_mmsize(cpu_flags);
965  if (mmsize < 0)
966  return mmsize;
967 
968  /* Special fast path for in-place packed shuffle */
969  ret = solve_shuffle(ops, mmsize, out);
970  if (ret != AVERROR(ENOTSUP))
971  return ret;
972 
974  if (!chain)
975  return AVERROR(ENOMEM);
976 
977  *out = (SwsCompiledOp) {
978  .priv = chain,
979  .slice_align = 1,
981 
982  /* Use at most two full YMM regs during the widest precision section */
983  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
984  };
985 
986  for (int i = 0; i < ops->num_ops; i++) {
987  int op_block_size = out->block_size;
988  SwsOp *op = &ops->ops[i];
989 
990  if (op_is_type_invariant(op)) {
991  if (op->op == SWS_OP_CLEAR)
993  op_block_size *= ff_sws_pixel_type_size(op->type);
994  op->type = SWS_PIXEL_U8;
995  }
996 
998  ops, i, op_block_size, chain);
999  if (ret < 0) {
1000  av_log(ctx, AV_LOG_TRACE, "Failed to compile op %d\n", i);
1001  ff_sws_op_chain_free(chain);
1002  return ret;
1003  }
1004  }
1005 
1006 #define ASSIGN_PROCESS_FUNC(NAME) \
1007  do { \
1008  SWS_DECL_FUNC(NAME); \
1009  out->func = NAME; \
1010  } while (0)
1011 
1012  const SwsOp *read = ff_sws_op_list_input(ops);
1013  const SwsOp *write = ff_sws_op_list_output(ops);
1014  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
1015  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
1016  switch (FFMAX(read_planes, write_planes)) {
1017  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
1018  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
1019  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
1020  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
1021  }
1022 
1023  if (ret < 0) {
1024  ff_sws_op_chain_free(chain);
1025  return ret;
1026  }
1027 
1028  out->cpu_flags = chain->cpu_flags;
1029  out->over_read = chain->over_read;
1030  out->over_write = chain->over_write;
1031  return 0;
1032 }
1033 
1035  .name = "x86",
1036  .compile = compile,
1037  .hw_format = AV_PIX_FMT_NONE,
1038 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:50
SwsOpTable
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:159
check_filter_fma
static bool check_filter_fma(const SwsImplParams *params)
Definition: ops.c:287
SWS_PIXEL_U16
@ SWS_PIXEL_U16
Definition: ops.h:36
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:53
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ff_sws_setup_clear
int ff_sws_setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:306
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:837
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:643
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:62
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:720
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:1034
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:223
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:935
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:64
av_const
#define av_const
Definition: attributes.h:105
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:157
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:705
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:263
SwsOpBackend::name
const char * name
Definition: ops_internal.h:56
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:76
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpTable::block_size
int block_size
Definition: ops_chain.h:161
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:54
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:211
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: ops.h:38
SwsOpList::num_ops
int num_ops
Definition: ops.h:265
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:35
ff_sws_pixel_type_is_int
bool ff_sws_pixel_type_is_int(SwsPixelType type)
Definition: ops.c:91
AVRational::num
int num
Numerator.
Definition: rational.h:59
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:62
SwsOpChain::over_read
int over_read
Definition: ops_chain.h:90
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
Q
#define Q(q)
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(SwsOpPriv *)
Definition: ops_chain.h:87
avassert.h
AV_LOG_TRACE
#define AV_LOG_TRACE
Extremely verbose debugging, useful for libav* development.
Definition: log.h:236
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
AVFormatContext::flags
int flags
Flags modifying the (de)muxer behaviour.
Definition: avformat.h:1414
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
setup_clear
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:132
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:652
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:55
SwsReadWriteOp::kernel
SwsFilterWeights * kernel
Definition: ops.h:120
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
SwsImplParams::op
const SwsOp * op
Definition: ops_chain.h:107
tables
static const SwsOpTable *const tables[]
Definition: ops.c:827
check_filter_4x4_h
static bool check_filter_4x4_h(const SwsImplParams *params)
Definition: ops.c:424
setup_rw
static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:61
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:867
setup_filter_4x4_h
static int setup_filter_4x4_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:448
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
SwsImplParams
Definition: ops_chain.h:105
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOp::type
SwsPixelType type
Definition: ops.h:220
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:50
size
int size
Definition: twinvq_data.h:10344
setup_swap_bytes
static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:102
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, int ops_index, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:205
SwsShiftOp::amount
uint8_t amount
Definition: ops.h:146
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:51
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:52
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:960
SwsImplParams::ctx
SwsContext * ctx
Definition: ops_chain.h:108
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SwsOpList::ops
SwsOp * ops
Definition: ops.h:264
weights
static const int weights[]
Definition: hevc_pel.c:32
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:853
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
hscale_sizeof_weight
static int hscale_sizeof_weight(const SwsOp *op)
Definition: ops.c:332
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:218
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:149
av_cmp_q
static int av_cmp_q(AVRational a, AVRational b)
Compare two rationals.
Definition: rational.h:89
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_dispatch.h:100
setup_shift
static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:166
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
SwsImplResult::priv
SwsOpPriv priv
Definition: ops_chain.h:113
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:110
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:779
SwsOp::shift
SwsShiftOp shift
Definition: ops.h:226
av_mul_q
AVRational av_mul_q(AVRational b, AVRational c)
Multiply two rationals.
Definition: rational.c:80
SwsReadWriteOp::elems
uint8_t elems
Examples: rgba = 4x u8 packed yuv444p = 3x u8 rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack monow = ...
Definition: ops.h:108
mem.h
w
uint8_t w
Definition: llvidencdsp.c:39
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:278
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:312
int32_t
int32_t
Definition: audioconvert.c:56
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:663
stride
#define stride
Definition: h264pred_template.c:536
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h264.c:190
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:263
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:533
SwsContext
Main external API structure.
Definition: swscale.h:206
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
SwsImplResult
Definition: ops_chain.h:111
SwsImplParams::table
const SwsOpTable * table
Definition: ops_chain.h:106
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
SwsOpChain::over_write
int over_write
Definition: ops_chain.h:91
min
float min
Definition: vorbis_enc_data.h:429
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:342