FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Define the static table entry `op_<NAME>` for pixel type SWS_PIXEL_<TYPE>
 * and component mask MASK. Any additional arguments are spliced in verbatim
 * as further designated initializers of the SwsOpEntry. */
28 #define DECL_ENTRY(TYPE, MASK, NAME, ...) \
29  static const SwsOpEntry op_##NAME = { \
30  .type = SWS_PIXEL_##TYPE, \
31  .mask = MASK, \
32  __VA_ARGS__ \
33  }
34 
/* Like DECL_ENTRY, but additionally declares the external symbol
 * `ff_<NAME>` and installs it as the entry's .func. */
35 #define DECL_ASM(TYPE, MASK, NAME, ...) \
36  void ff_##NAME(void); \
37  DECL_ENTRY(TYPE, MASK, NAME, \
38  .func = ff_##NAME, \
39  __VA_ARGS__)
40 
/* Declare one DECL_ASM entry named `p<X><Y><Z><W>_<NAME>` whose component
 * mask is SWS_COMP_MASK(X, Y, Z, W). */
41 #define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
42  DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), p##X##Y##Z##W##_##NAME, \
43  __VA_ARGS__ \
44  )
45 
/* Address of the entry named `op_p<X><Y><Z><W>_<NAME>`, i.e. the naming
 * scheme produced by DECL_PATTERN. */
46 #define REF_PATTERN(NAME, X, Y, Z, W) \
47  &op_p##X##Y##Z##W##_##NAME
48 
/* Declare NAME for the four component patterns 1000, 1001, 1110 and 1111.
 * The last DECL_PATTERN is not followed by a semicolon inside the macro. */
49 #define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
50  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
51  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
52  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
53  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
54 
/* Comma-separated references to the four entries declared by
 * DECL_COMMON_PATTERNS (patterns 1000, 1001, 1110, 1111), for use inside an
 * entry list. */
55 #define REF_COMMON_PATTERNS(NAME) \
56  REF_PATTERN(NAME, 1, 0, 0, 0), \
57  REF_PATTERN(NAME, 1, 0, 0, 1), \
58  REF_PATTERN(NAME, 1, 1, 1, 0), \
59  REF_PATTERN(NAME, 1, 1, 1, 1)
60 
61 static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
62 {
63  const SwsOp *op = params->op;
64 
65  /* 3-component reads/writes process one extra garbage word */
66  if (op->rw.packed && op->rw.elems == 3) {
67  switch (op->op) {
68  case SWS_OP_READ: out->over_read = sizeof(uint32_t); break;
69  case SWS_OP_WRITE: out->over_write = sizeof(uint32_t); break;
70  }
71  }
72 
73  return 0;
74 }
75 
/* Declare the read/write entry `op_<NAME><ELEMS><EXT>` for SWS_OP_<OP> with
 * the given element count, packed flag and frac value; setup_rw is installed
 * as its setup hook. The expansion already ends in a semicolon. */
76 #define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
77  DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##EXT, \
78  .op = SWS_OP_##OP, \
79  .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
80  .setup = setup_rw, \
81  );
82 
/* Declare the packed read and write entries of pixel type U<DEPTH> for
 * 2, 3 and 4 elements (frac 0). */
83 #define DECL_PACKED_RW(EXT, DEPTH) \
84  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
85  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
86  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
87  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
88  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
89  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
90 
/* Declare a matching pair of entries for the bit pattern {X, Y, Z, W}:
 * `pack_<XYZW><EXT>` (SWS_OP_PACK, mask SWS_COMP(0)) and
 * `unpack_<XYZW><EXT>` (SWS_OP_UNPACK, mask SWS_COMP_MASK(X, Y, Z, W)). */
91 #define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
92  DECL_ASM(TYPE, SWS_COMP(0), pack_##X##Y##Z##W##EXT, \
93  .op = SWS_OP_PACK, \
94  .pack.pattern = {X, Y, Z, W}, \
95  ); \
96  \
97  DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), unpack_##X##Y##Z##W##EXT, \
98  .op = SWS_OP_UNPACK, \
99  .pack.pattern = {X, Y, Z, W}, \
100  ); \
101 
102 static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
103 {
104  const int mask = ff_sws_pixel_type_size(params->op->type) - 1;
105  for (int i = 0; i < 16; i++)
106  out->priv.u8[i] = (i & ~mask) | (mask - (i & mask));
107  return 0;
108 }
109 
/* Declare the SWS_OP_SWAP_BYTES entry `op_p<XYZW>_swap_bytes_<TYPE><EXT>`.
 * Uses DECL_ENTRY rather than DECL_ASM: the function is `ff_p<XYZW>_shuffle<EXT>`,
 * which this macro does not declare itself, and setup_swap_bytes is the
 * setup hook. */
110 #define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
111  DECL_ENTRY(TYPE, SWS_COMP_MASK(X, Y, Z, W), \
112  p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
113  .op = SWS_OP_SWAP_BYTES, \
114  .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
115  .setup = setup_swap_bytes, \
116  );
117 
/* Declare the U8 SWS_OP_CLEAR entry `op_clear_alpha<IDX><EXT>`, matching a
 * clear of component IDX only, with clear value { -1, 1 }. */
118 #define DECL_CLEAR_ALPHA(EXT, IDX) \
119  DECL_ASM(U8, SWS_COMP_ALL, clear_alpha##IDX##EXT, \
120  .op = SWS_OP_CLEAR, \
121  .clear.mask = SWS_COMP(IDX), \
122  .clear.value[IDX] = { -1, 1 }, \
123  ); \
124 
/* Declare the U8 SWS_OP_CLEAR entry `op_clear_zero<IDX><EXT>`, matching a
 * clear of component IDX only, with clear value { 0, 1 }. */
125 #define DECL_CLEAR_ZERO(EXT, IDX) \
126  DECL_ASM(U8, SWS_COMP_ALL, clear_zero##IDX##EXT, \
127  .op = SWS_OP_CLEAR, \
128  .clear.mask = SWS_COMP(IDX), \
129  .clear.value[IDX] = { 0, 1 }, \
130  );
131 
132 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
133 {
134  const SwsOp *op = params->op;
135  for (int i = 0; i < 4; i++)
136  out->priv.u32[i] = (uint32_t) op->clear.value[i].num;
137  return 0;
138 }
139 
/* Declare the U8 SWS_OP_CLEAR entry `op_p<XYZW>_clear<EXT>` for the clear
 * mask SWS_COMP_MASK(X, Y, Z, W); the values are supplied at setup time by
 * setup_clear. */
140 #define DECL_CLEAR(EXT, X, Y, Z, W) \
141  DECL_ASM(U8, SWS_COMP_ALL, p##X##Y##Z##W##_clear##EXT, \
142  .op = SWS_OP_CLEAR, \
143  .setup = setup_clear, \
144  .clear.mask = SWS_COMP_MASK(X, Y, Z, W), \
145  );
146 
/* Declare the U8 SWS_OP_SWIZZLE entry `op_swizzle_<XYZW><EXT>` with
 * swizzle.in = {X, Y, Z, W}. */
147 #define DECL_SWIZZLE(EXT, X, Y, Z, W) \
148  DECL_ASM(U8, SWS_COMP_ALL, swizzle_##X##Y##Z##W##EXT, \
149  .op = SWS_OP_SWIZZLE, \
150  .swizzle.in = {X, Y, Z, W}, \
151  );
152 
/* Declare the common-pattern SWS_OP_CONVERT entries
 * `convert_<FROM>_<TO><EXT>` from pixel type FROM to SWS_PIXEL_<TO>. */
153 #define DECL_CONVERT(EXT, FROM, TO) \
154  DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
155  .op = SWS_OP_CONVERT, \
156  .convert.to = SWS_PIXEL_##TO, \
157  );
158 
/* Same as DECL_CONVERT, but the entries are named `expand_<FROM>_<TO><EXT>`
 * and have .convert.expand set to true. */
159 #define DECL_EXPAND(EXT, FROM, TO) \
160  DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
161  .op = SWS_OP_CONVERT, \
162  .convert.to = SWS_PIXEL_##TO, \
163  .convert.expand = true, \
164  );
165 
166 static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
167 {
168  out->priv.u16[0] = params->op->shift.amount;
169  return 0;
170 }
171 
/* Declare the common-pattern U16 entries `lshift16<EXT>` (SWS_OP_LSHIFT) and
 * `rshift16<EXT>` (SWS_OP_RSHIFT); both are marked flexible and take their
 * shift amount from setup_shift. */
172 #define DECL_SHIFT16(EXT) \
173  DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
174  .op = SWS_OP_LSHIFT, \
175  .setup = setup_shift, \
176  .flexible = true, \
177  ); \
178  \
179  DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
180  .op = SWS_OP_RSHIFT, \
181  .setup = setup_shift, \
182  .flexible = true, \
183  );
184 
/* Declare the common-pattern F32 entries `min<EXT>` (SWS_OP_MIN) and
 * `max<EXT>` (SWS_OP_MAX), both set up through ff_sws_setup_clamp. */
185 #define DECL_MIN_MAX(EXT) \
186  DECL_COMMON_PATTERNS(F32, min##EXT, \
187  .op = SWS_OP_MIN, \
188  .setup = ff_sws_setup_clamp, \
189  ); \
190  \
191  DECL_COMMON_PATTERNS(F32, max##EXT, \
192  .op = SWS_OP_MAX, \
193  .setup = ff_sws_setup_clamp, \
194  );
195 
/* Declare the common-pattern F32 SWS_OP_SCALE entries `scale<EXT>`, marked
 * flexible and set up through ff_sws_setup_scale. */
196 #define DECL_SCALE(EXT) \
197  DECL_COMMON_PATTERNS(F32, scale##EXT, \
198  .op = SWS_OP_SCALE, \
199  .setup = ff_sws_setup_scale, \
200  .flexible = true, \
201  );
202 
/* Declare the U<BITS> entry `op_expand_bits<BITS><EXT>`: an SWS_OP_SCALE on
 * component 0 whose scale factor is fixed to (2^BITS - 1) / 1. */
203 #define DECL_EXPAND_BITS(EXT, BITS) \
204  DECL_ASM(U##BITS, SWS_COMP(0), expand_bits##BITS##EXT, \
205  .op = SWS_OP_SCALE, \
206  .scale = { .num = ((1 << (BITS)) - 1), .den = 1 }, \
207  );
208 
209 static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
210 {
211  const SwsOp *op = params->op;
212  /* 1x1 matrix / single constant */
213  if (!op->dither.size_log2) {
214  const AVRational k = op->dither.matrix[0];
215  out->priv.f32[0] = (float) k.num / k.den;
216  return 0;
217  }
218 
219  const int size = 1 << op->dither.size_log2;
220  const int8_t *off = op->dither.y_offset;
221  int max_offset = 0;
222  for (int i = 0; i < 4; i++) {
223  if (off[i] >= 0)
224  max_offset = FFMAX(max_offset, off[i] & (size - 1));
225  }
226 
227  /* Allocate extra rows to allow over-reading for row offsets. Note that
228  * max_offset is currently never larger than 5, so the extra space needed
229  * for this over-allocation is bounded by 5 * size * sizeof(float),
230  * typically 320 bytes for a 16x16 dither matrix. */
231  const int stride = size * sizeof(float);
232  const int num_rows = size + max_offset;
233  float *matrix = out->priv.ptr = av_mallocz(num_rows * stride);
234  if (!matrix)
235  return AVERROR(ENOMEM);
236  out->free = ff_op_priv_free;
237 
238  for (int i = 0; i < size * size; i++)
239  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
240 
241  memcpy(&matrix[size * size], matrix, max_offset * stride);
242 
243  /* Store relative pointer offset to each row inside extra space */
244  static_assert(sizeof(out->priv.ptr) <= sizeof(int16_t[4]),
245  ">8 byte pointers not supported");
246  assert(max_offset * stride <= INT16_MAX);
247  int16_t *off_out = &out->priv.i16[4];
248  for (int i = 0; i < 4; i++)
249  off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;
250 
251  return 0;
252 }
253 
/* Declare the common-pattern F32 SWS_OP_DITHER entries `dither0<EXT>`, set up
 * through setup_dither. No .dither_size is given for these entries. */
254 #define DECL_DITHER0(EXT) \
255  DECL_COMMON_PATTERNS(F32, dither0##EXT, \
256  .op = SWS_OP_DITHER, \
257  .setup = setup_dither, \
258  );
259 
/* Declare the F32 SWS_OP_DITHER entry `op_dither<SIZE><EXT>` with
 * .dither_size = SIZE, set up through setup_dither. */
260 #define DECL_DITHER(EXT, SIZE) \
261  DECL_ASM(F32, SWS_COMP_ALL, dither##SIZE##EXT, \
262  .op = SWS_OP_DITHER, \
263  .setup = setup_dither, \
264  .dither_size = SIZE, \
265  );
266 
267 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
268 {
269  const SwsOp *op = params->op;
270 
271  float *matrix = out->priv.ptr = av_mallocz(sizeof(float[4][5]));
272  if (!matrix)
273  return AVERROR(ENOMEM);
274  out->free = ff_op_priv_free;
275 
276  for (int y = 0; y < 4; y++) {
277  for (int x = 0; x < 5; x++)
278  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
279  }
280 
281  return 0;
282 }
283 
/* Declare the F32 SWS_OP_LINEAR entry `op_<NAME><EXT>` with
 * .linear_mask = MASK, set up through setup_linear. */
284 #define DECL_LINEAR(EXT, NAME, MASK) \
285  DECL_ASM(F32, SWS_COMP_ALL, NAME##EXT, \
286  .op = SWS_OP_LINEAR, \
287  .setup = setup_linear, \
288  .linear_mask = (MASK), \
289  );
290 
291 static bool check_filter_fma(const SwsImplParams *params)
292 {
293  const SwsOp *op = params->op;
294  SwsContext *ctx = params->ctx;
295  if (!(ctx->flags & SWS_BITEXACT))
296  return true;
297 
298  if (!ff_sws_pixel_type_is_int(op->type))
299  return false;
300 
301  /* Check if maximum/minimum partial sum fits losslessly inside float */
302  AVRational max_range = { 1 << 24, 1 };
303  AVRational min_range = { -(1 << 24), 1 };
305 
306  for (int i = 0; i < op->rw.elems; i++) {
307  const AVRational min = av_mul_q(op->comps.min[i], scale);
308  const AVRational max = av_mul_q(op->comps.max[i], scale);
309  if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0)
310  return false;
311  }
312 
313  return true;
314 }
315 
316 static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
317 {
318  const SwsFilterWeights *filter = params->op->rw.kernel;
319  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
320  ">8 byte pointers not supported");
321 
322  /* Pre-convert weights to float */
323  float *weights = av_calloc(filter->num_weights, sizeof(float));
324  if (!weights)
325  return AVERROR(ENOMEM);
326 
327  for (int i = 0; i < filter->num_weights; i++)
328  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
329 
330  out->priv.ptr = weights;
331  out->priv.uptr[1] = filter->filter_size;
332  out->free = ff_op_priv_free;
333  return 0;
334 }
335 
336 static int hscale_sizeof_weight(const SwsOp *op)
337 {
338  switch (op->type) {
339  case SWS_PIXEL_U8: return sizeof(int16_t);
340  case SWS_PIXEL_U16: return sizeof(int16_t);
341  case SWS_PIXEL_F32: return sizeof(float);
342  default: return 0;
343  }
344 }
345 
/**
 * Setup hook for the horizontal filter entries.
 *
 * Allocates a zeroed weight table of line_size * aligned_size weights (each
 * hscale_sizeof_weight() bytes) and copies the kernel weights into it,
 * re-ordered block by block as described by the comments in the body. The
 * table is handed over through out->priv.ptr (released with
 * ff_op_priv_free), the aligned tap count is stored in out->priv.uptr[1],
 * and the padding taps are reported as over-read.
 *
 * @return 0 on success, AVERROR(EINVAL) if the aligned tap count exceeds
 *         INT_MAX, AVERROR(ENOMEM) if the table cannot be allocated
 */
346 static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
347 {
348  const SwsOp *op = params->op;
349  const SwsFilterWeights *filter = op->rw.kernel;
350 
351  /**
352  * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller
353  * size, we need to gather 2/4 taps simultaneously and unroll the inner
354  * loop over several packed samples.
355  */
356  const int pixel_size = ff_sws_pixel_type_size(op->type);
357  const int taps_align = sizeof(int32_t) / pixel_size;
358  const int filter_size = filter->filter_size;
359  const int block_size = params->table->block_size;
360  const size_t aligned_size = FFALIGN(filter_size, taps_align);
361  const size_t line_size = FFALIGN(filter->dst_size, block_size);
362  av_assert1(FFALIGN(line_size, taps_align) == line_size);
363  if (aligned_size > INT_MAX)
364  return AVERROR(EINVAL);
365 
366  union {
367  void *ptr;
368  int16_t *i16;
369  float *f32;
370  } weights;
371 
372  const int sizeof_weight = hscale_sizeof_weight(op);
373  weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size);
374  if (!weights.ptr)
375  return AVERROR(ENOMEM);
376 
377  /**
378  * Transpose filter weights to group (aligned) taps by block
379  */
/* NOTE(review): mmsize is derived as twice the block size; presumably this
 * is the vector width in bytes used by these kernels -- confirm. */
380  const int mmsize = block_size * 2;
381  const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */
382  for (size_t x = 0; x < line_size; x += block_size) {
/* The last block may be partial: only copy weights for pixels that exist */
383  const int elems = FFMIN(block_size, filter->dst_size - x);
384  for (int j = 0; j < filter_size; j++) {
/* jb: first tap of the aligned tap group containing j; ji: index inside it */
385  const int jb = j & ~(taps_align - 1);
386  const int ji = j - jb;
387  const size_t idx_base = x * aligned_size + jb * block_size + ji;
388  for (int i = 0; i < elems; i++) {
389  const int w = filter->weights[(x + i) * filter_size + j];
390  size_t idx = idx_base;
391  if (op->type == SWS_PIXEL_U8) {
392  /* Interleave the pixels within each lane, i.e.:
393  * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0)
394  * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1)
395  * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0)
396  * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1)
397  * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0)
398  * ...
399  * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1)
400  * (repeat for taps 4-7, etc.)
401  */
402  const int gather_base = i & ~(gather_size - 1);
403  const int gather_pos = i - gather_base;
404  const int lane_idx = gather_pos >> 2;
405  const int pos_in_lane = gather_pos & 3;
406  idx += gather_base * 4 /* which gather (m0 or m1) */
407  + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */
408  + lane_idx * 8 /* 8 ints per lane */
409  + (pos_in_lane & 1) * 4; /* 4 taps per pair */
410  } else {
411  idx += i * taps_align;
412  }
413 
/* Store in the element type selected by hscale_sizeof_weight(); other
 * pixel types store nothing */
414  switch (op->type) {
415  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
416  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
417  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
418  }
419  }
420  }
421  }
422 
423  out->priv.ptr = weights.ptr;
424  out->priv.uptr[1] = aligned_size;
425  out->free = ff_op_priv_free;
/* Taps added by the alignment, expressed in bytes of input pixels */
426  out->over_read = (aligned_size - filter_size) * pixel_size;
427  return 0;
428 }
429 
430 static bool check_filter_4x4_h(const SwsImplParams *params)
431 {
432  SwsContext *ctx = params->ctx;
433  const SwsOp *op = params->op;
434  if ((ctx->flags & SWS_BITEXACT) && op->type == SWS_PIXEL_F32)
435  return false; /* different accumulation order due to 4x4 transpose */
436 
437  const int cpu_flags = av_get_cpu_flags();
439  return true; /* always prefer over gathers if gathers are slow */
440 
441  /**
442  * Otherwise, prefer it above a certain filter size. Empirically, this
443  * kernel seems to be faster whenever the reference/gather kernel crosses
444  * a breakpoint for the number of gathers needed, but this filter doesn't.
445  *
446  * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
447  */
448  const SwsFilterWeights *filter = op->rw.kernel;
449  return op->type == SWS_PIXEL_U8 && filter->filter_size > 12 ||
450  op->type == SWS_PIXEL_U16 && filter->filter_size > 4 ||
451  op->type == SWS_PIXEL_F32 && filter->filter_size > 1;
452 }
453 
455 {
456  const SwsOp *op = params->op;
457  const SwsFilterWeights *filter = op->rw.kernel;
458  const int pixel_size = ff_sws_pixel_type_size(op->type);
459  const int sizeof_weights = hscale_sizeof_weight(op);
460  const int block_size = params->table->block_size;
461  const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
462  const int pixels_align = 4; /* pixels per iteration */
463  const int filter_size = filter->filter_size;
464  const size_t aligned_size = FFALIGN(filter_size, taps_align);
465  const int line_size = FFALIGN(filter->dst_size, block_size);
466  av_assert1(FFALIGN(line_size, pixels_align) == line_size);
467 
468  union {
469  void *ptr;
470  int16_t *i16;
471  float *f32;
472  } weights;
473 
474  weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
475  if (!weights.ptr)
476  return AVERROR(ENOMEM);
477 
478  /**
479  * Desired memory layout: [w][taps][pixels_align][taps_align]
480  *
481  * Example with taps_align=8, pixels_align=4:
482  * [a0, a1, ... a7] weights for pixel 0, taps 0..7
483  * [b0, b1, ... b7] weights for pixel 1, taps 0..7
484  * [c0, c1, ... c7] weights for pixel 2, taps 0..7
485  * [d0, d1, ... d7] weights for pixel 3, taps 0..7
486  * [a8, a9, ... a15] weights for pixel 0, taps 8..15
487  * ...
488  * repeat for all taps, then move on to pixels 4..7, etc.
489  */
490  for (int x = 0; x < filter->dst_size; x++) {
491  for (int j = 0; j < filter_size; j++) {
492  const int xb = x & ~(pixels_align - 1);
493  const int jb = j & ~(taps_align - 1);
494  const int xi = x - xb, ji = j - jb;
495  const int w = filter->weights[x * filter_size + j];
496  const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
497 
498  switch (op->type) {
499  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
500  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
501  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
502  }
503  }
504  }
505 
506  out->priv.ptr = weights.ptr;
507  out->priv.uptr[1] = aligned_size * sizeof_weights;
508  out->free = ff_op_priv_free;
509  out->over_read = (aligned_size - filter_size) * pixel_size;
510  return 0;
511 }
512 
/* Declare the filtered-read entry `op_<NAME><ELEMS>_<TYPE><EXT>`: an
 * SWS_OP_READ of ELEMS elements with .rw.filter = SWS_OP_FILTER_<DIR>. Extra
 * arguments (e.g. .setup / .check) are passed through as initializers. */
513 #define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \
514  DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##_##TYPE##EXT, \
515  .op = SWS_OP_READ, \
516  .rw.elems = ELEMS, \
517  .rw.filter = SWS_OP_FILTER_##DIR, \
518  __VA_ARGS__ \
519  );
520 
/* Instantiate DECL_FILTER for 1, 2, 3 and 4 elements. */
521 #define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) \
522  DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__) \
523  DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__) \
524  DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__) \
525  DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__)
526 
/* Declare all four filter kernel families for one pixel type: vertical
 * (filter_v), vertical guarded by check_filter_fma (filter_fma_v),
 * horizontal (filter_h) and horizontal guarded by check_filter_4x4_h
 * (filter_4x4_h), each with its matching setup hook. */
527 #define DECL_FILTERS_GENERIC(EXT, TYPE) \
528  DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \
529  DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \
530  .check = check_filter_fma) \
531  DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) \
532  DECL_FILTERS(EXT, TYPE, H, filter_4x4_h, .setup = setup_filter_4x4_h, \
533  .check = check_filter_4x4_h)
534 
/* Comma-separated references to the 1- to 4-element entries
 * `op_<NAME><n><SUFFIX>`, for use inside an entry list. */
535 #define REF_FILTERS(NAME, SUFFIX) \
536  &op_##NAME##1##SUFFIX, \
537  &op_##NAME##2##SUFFIX, \
538  &op_##NAME##3##SUFFIX, \
539  &op_##NAME##4##SUFFIX
540 
541 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
542  DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
543  DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
544  DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
545  DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
546  DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
547  DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
548  DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
549  DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
550  DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
551  DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
552  DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
553  DECL_EXPAND_BITS(EXT, 8) \
554  DECL_PACKED_RW(EXT, 8) \
555  DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
556  DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
557  DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
558  void ff_p1000_shuffle##EXT(void); \
559  void ff_p1001_shuffle##EXT(void); \
560  void ff_p1110_shuffle##EXT(void); \
561  void ff_p1111_shuffle##EXT(void); \
562  DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
563  DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
564  DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
565  DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
566  DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
567  DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
568  DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
569  DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
570  DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
571  DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
572  DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
573  DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
574  DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
575  DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
576  DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
577  DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
578  DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
579  DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
580  DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
581  DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
582  DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
583  DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
584  DECL_CLEAR_ALPHA(EXT, 0) \
585  DECL_CLEAR_ALPHA(EXT, 1) \
586  DECL_CLEAR_ALPHA(EXT, 3) \
587  DECL_CLEAR_ZERO(EXT, 0) \
588  DECL_CLEAR_ZERO(EXT, 1) \
589  DECL_CLEAR_ZERO(EXT, 3) \
590  DECL_CLEAR(EXT, 0, 0, 0, 1) \
591  DECL_CLEAR(EXT, 1, 0, 0, 0) \
592  DECL_CLEAR(EXT, 1, 1, 0, 0) \
593  DECL_CLEAR(EXT, 0, 1, 1, 0) \
594  DECL_CLEAR(EXT, 0, 0, 1, 1) \
595  DECL_CLEAR(EXT, 1, 0, 1, 0) \
596  DECL_CLEAR(EXT, 0, 1, 0, 1) \
597  DECL_CLEAR(EXT, 0, 1, 1, 1) \
598  DECL_CLEAR(EXT, 1, 0, 1, 1) \
599  DECL_CLEAR(EXT, 1, 1, 0, 1) \
600  \
601 static const SwsOpTable ops8##EXT = { \
602  .cpu_flags = AV_CPU_FLAG_##FLAG, \
603  .block_size = SIZE, \
604  .entries = { \
605  &op_read_planar1##EXT, \
606  &op_read_planar2##EXT, \
607  &op_read_planar3##EXT, \
608  &op_read_planar4##EXT, \
609  &op_write_planar1##EXT, \
610  &op_write_planar2##EXT, \
611  &op_write_planar3##EXT, \
612  &op_write_planar4##EXT, \
613  &op_read8_packed2##EXT, \
614  &op_read8_packed3##EXT, \
615  &op_read8_packed4##EXT, \
616  &op_write8_packed2##EXT, \
617  &op_write8_packed3##EXT, \
618  &op_write8_packed4##EXT, \
619  &op_read_nibbles1##EXT, \
620  &op_read_bits1##EXT, \
621  &op_write_bits1##EXT, \
622  &op_expand_bits8##EXT, \
623  &op_pack_1210##EXT, \
624  &op_pack_3320##EXT, \
625  &op_pack_2330##EXT, \
626  &op_unpack_1210##EXT, \
627  &op_unpack_3320##EXT, \
628  &op_unpack_2330##EXT, \
629  &op_swizzle_3012##EXT, \
630  &op_swizzle_3021##EXT, \
631  &op_swizzle_2103##EXT, \
632  &op_swizzle_3210##EXT, \
633  &op_swizzle_3102##EXT, \
634  &op_swizzle_3201##EXT, \
635  &op_swizzle_1203##EXT, \
636  &op_swizzle_1023##EXT, \
637  &op_swizzle_2013##EXT, \
638  &op_swizzle_2310##EXT, \
639  &op_swizzle_2130##EXT, \
640  &op_swizzle_1230##EXT, \
641  &op_swizzle_1320##EXT, \
642  &op_swizzle_0213##EXT, \
643  &op_swizzle_0231##EXT, \
644  &op_swizzle_0312##EXT, \
645  &op_swizzle_3120##EXT, \
646  &op_swizzle_0321##EXT, \
647  &op_swizzle_0003##EXT, \
648  &op_swizzle_0001##EXT, \
649  &op_swizzle_3000##EXT, \
650  &op_swizzle_1000##EXT, \
651  &op_clear_alpha0##EXT, \
652  &op_clear_alpha1##EXT, \
653  &op_clear_alpha3##EXT, \
654  &op_clear_zero0##EXT, \
655  &op_clear_zero1##EXT, \
656  &op_clear_zero3##EXT, \
657  REF_PATTERN(clear##EXT, 0, 0, 0, 1), \
658  REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
659  REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
660  REF_PATTERN(clear##EXT, 0, 1, 1, 0), \
661  REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
662  REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
663  REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
664  REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
665  REF_PATTERN(clear##EXT, 1, 0, 1, 1), \
666  REF_PATTERN(clear##EXT, 1, 1, 0, 1), \
667  NULL \
668  }, \
669 };
670 
671 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
672  DECL_PACKED_RW(EXT, 16) \
673  DECL_EXPAND_BITS(EXT, 16) \
674  DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
675  DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
676  DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
677  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
678  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
679  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
680  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
681  DECL_SHIFT16(EXT) \
682  DECL_CONVERT(EXT, U8, U16) \
683  DECL_CONVERT(EXT, U16, U8) \
684  DECL_EXPAND(EXT, U8, U16) \
685  \
686 static const SwsOpTable ops16##EXT = { \
687  .cpu_flags = AV_CPU_FLAG_##FLAG, \
688  .block_size = SIZE, \
689  .entries = { \
690  &op_read16_packed2##EXT, \
691  &op_read16_packed3##EXT, \
692  &op_read16_packed4##EXT, \
693  &op_write16_packed2##EXT, \
694  &op_write16_packed3##EXT, \
695  &op_write16_packed4##EXT, \
696  &op_pack_4440##EXT, \
697  &op_pack_5550##EXT, \
698  &op_pack_5650##EXT, \
699  &op_unpack_4440##EXT, \
700  &op_unpack_5550##EXT, \
701  &op_unpack_5650##EXT, \
702  &op_expand_bits16##EXT, \
703  REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
704  REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
705  REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
706  REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
707  REF_COMMON_PATTERNS(lshift16##EXT), \
708  REF_COMMON_PATTERNS(rshift16##EXT), \
709  NULL \
710  }, \
711 };
712 
713 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
714  DECL_PACKED_RW(_m2##EXT, 32) \
715  DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
716  DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
717  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
718  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
719  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
720  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
721  DECL_CONVERT(EXT, U8, U32) \
722  DECL_CONVERT(EXT, U32, U8) \
723  DECL_CONVERT(EXT, U16, U32) \
724  DECL_CONVERT(EXT, U32, U16) \
725  DECL_CONVERT(EXT, U8, F32) \
726  DECL_CONVERT(EXT, F32, U8) \
727  DECL_CONVERT(EXT, U16, F32) \
728  DECL_CONVERT(EXT, F32, U16) \
729  DECL_EXPAND(EXT, U8, U32) \
730  DECL_MIN_MAX(EXT) \
731  DECL_SCALE(EXT) \
732  DECL_DITHER0(EXT) \
733  DECL_DITHER(EXT, 1) \
734  DECL_DITHER(EXT, 2) \
735  DECL_DITHER(EXT, 3) \
736  DECL_DITHER(EXT, 4) \
737  DECL_DITHER(EXT, 5) \
738  DECL_DITHER(EXT, 6) \
739  DECL_DITHER(EXT, 7) \
740  DECL_DITHER(EXT, 8) \
741  DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
742  DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
743  DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
744  DECL_LINEAR(EXT, yalpha, SWS_MASK(1, 1)) \
745  DECL_LINEAR(EXT, dot3, 0x7) \
746  DECL_LINEAR(EXT, dot3a, 0x7 | SWS_MASK_ALPHA) \
747  DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0) ^ SWS_MASK(0, 3)) \
748  DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
749  DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
750  DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
751  DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
752  DECL_LINEAR(EXT, affine3uv, \
753  SWS_MASK_MAT3 | SWS_MASK_OFF(1) | SWS_MASK_OFF(2)) \
754  DECL_LINEAR(EXT, affine3x, \
755  SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
756  DECL_LINEAR(EXT, affine3xa, \
757  SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
758  DECL_LINEAR(EXT, affine3xy, \
759  SWS_MASK_MAT3 ^ SWS_MASK(0, 0) ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
760  DECL_LINEAR(EXT, affine3a, \
761  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
762  DECL_FILTERS_GENERIC(EXT, U8) \
763  DECL_FILTERS_GENERIC(EXT, U16) \
764  DECL_FILTERS_GENERIC(EXT, F32) \
765  \
766 static const SwsOpTable ops32##EXT = { \
767  .cpu_flags = AV_CPU_FLAG_##FLAG, \
768  .block_size = SIZE, \
769  .entries = { \
770  &op_read32_packed2_m2##EXT, \
771  &op_read32_packed3_m2##EXT, \
772  &op_read32_packed4_m2##EXT, \
773  &op_write32_packed2_m2##EXT, \
774  &op_write32_packed3_m2##EXT, \
775  &op_write32_packed4_m2##EXT, \
776  &op_pack_1010102_m2##EXT, \
777  &op_pack_2101010_m2##EXT, \
778  &op_unpack_1010102_m2##EXT, \
779  &op_unpack_2101010_m2##EXT, \
780  REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
781  REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
782  REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
783  REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
784  REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
785  REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
786  REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
787  REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
788  REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
789  REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
790  REF_COMMON_PATTERNS(min##EXT), \
791  REF_COMMON_PATTERNS(max##EXT), \
792  REF_COMMON_PATTERNS(scale##EXT), \
793  REF_COMMON_PATTERNS(dither0##EXT), \
794  &op_dither1##EXT, \
795  &op_dither2##EXT, \
796  &op_dither3##EXT, \
797  &op_dither4##EXT, \
798  &op_dither5##EXT, \
799  &op_dither6##EXT, \
800  &op_dither7##EXT, \
801  &op_dither8##EXT, \
802  &op_luma##EXT, \
803  &op_alpha##EXT, \
804  &op_lumalpha##EXT, \
805  &op_yalpha##EXT, \
806  &op_dot3##EXT, \
807  &op_dot3a##EXT, \
808  &op_row0##EXT, \
809  &op_diag3##EXT, \
810  &op_diag4##EXT, \
811  &op_diagoff3##EXT, \
812  &op_affine3##EXT, \
813  &op_affine3uv##EXT, \
814  &op_affine3x##EXT, \
815  &op_affine3xa##EXT, \
816  &op_affine3xy##EXT, \
817  &op_affine3a##EXT, \
818  REF_FILTERS(filter_fma_v, _U8##EXT), \
819  REF_FILTERS(filter_fma_v, _U16##EXT), \
820  REF_FILTERS(filter_fma_v, _F32##EXT), \
821  REF_FILTERS(filter_4x4_h, _U8##EXT), \
822  REF_FILTERS(filter_4x4_h, _U16##EXT), \
823  REF_FILTERS(filter_4x4_h, _F32##EXT), \
824  REF_FILTERS(filter_v, _U8##EXT), \
825  REF_FILTERS(filter_v, _U16##EXT), \
826  REF_FILTERS(filter_v, _F32##EXT), \
827  REF_FILTERS(filter_h, _U8##EXT), \
828  REF_FILTERS(filter_h, _U16##EXT), \
829  REF_FILTERS(filter_h, _F32##EXT), \
830  NULL \
831  }, \
832 };
833 
834 DECL_FUNCS_8(16, _m1_sse4, SSE4)
835 DECL_FUNCS_8(32, _m1_avx2, AVX2)
836 DECL_FUNCS_8(32, _m2_sse4, SSE4)
837 DECL_FUNCS_8(64, _m2_avx2, AVX2)
838 
839 DECL_FUNCS_16(16, _m1_avx2, AVX2)
840 DECL_FUNCS_16(32, _m2_avx2, AVX2)
841 
842 DECL_FUNCS_32(16, _avx2, AVX2)
843 
/* Pointers to every op table instantiated by the DECL_FUNCS_8 / DECL_FUNCS_16 /
 * DECL_FUNCS_32 invocations above. */
844 static const SwsOpTable *const tables[] = {
845  &ops8_m1_sse4,
846  &ops8_m1_avx2,
847  &ops8_m2_sse4,
848  &ops8_m2_avx2,
849  &ops16_m1_avx2,
850  &ops16_m2_avx2,
851  &ops32_avx2,
852 };
853 
854 static av_const int get_mmsize(const int cpu_flags)
855 {
857  return 64;
858  else if (cpu_flags & AV_CPU_FLAG_AVX2)
859  return 32;
860  else if (cpu_flags & AV_CPU_FLAG_SSE4)
861  return 16;
862  else
863  return AVERROR(ENOTSUP);
864 }
865 
866 /**
867  * Returns true if the operation's implementation only depends on the block
868  * size, and not the underlying pixel type
869  */
870 static bool op_is_type_invariant(const SwsOp *op)
871 {
872  switch (op->op) {
873  case SWS_OP_READ:
874  case SWS_OP_WRITE:
875  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac && !op->rw.filter;
876  case SWS_OP_SWIZZLE:
877  case SWS_OP_CLEAR:
878  return true;
879  }
880 
881  return false;
882 }
883 
/**
 * Number of bytes moved by the narrowest move that covers `bytes` bytes:
 * 4 (movd) for up to 4 bytes, 8 (movq) for up to 8 bytes, and otherwise the
 * full vector size `mmsize` (movu).
 */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes <= 4)
        return 4; /* movd */
    if (bytes <= 8)
        return 8; /* movq */
    return mmsize; /* movu */
}
890 
891 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
892 {
893  uint8_t shuffle[16];
894  int read_bytes, write_bytes;
895  int pixels;
896 
897  /* Solve the shuffle mask for one 128-bit lane only */
898  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
899  if (pixels < 0)
900  return pixels;
901 
902  /* We can't shuffle across lanes, so restrict the vector size to XMM
903  * whenever the read/write size would be a subset of the full vector */
904  if (read_bytes < 16 || write_bytes < 16)
905  mmsize = 16;
906 
907  const int num_lanes = mmsize / 16;
908  const int in_total = num_lanes * read_bytes;
909  const int out_total = num_lanes * write_bytes;
910 
911  *out = (SwsCompiledOp) {
912  .priv = av_memdup(shuffle, sizeof(shuffle)),
913  .free = av_free,
914  .slice_align = 1,
915  .block_size = pixels * num_lanes,
916  .over_read = movsize(in_total, mmsize) - in_total,
917  .over_write = movsize(out_total, mmsize) - out_total,
918  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
919  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
921  };
922 
923  if (!out->priv)
924  return AVERROR(ENOMEM);
925 
926 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
927 do { \
928  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
929  if (in_total == IN && out_total == OUT) \
930  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
931 } while (0)
932 
933  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
934  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
935  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
936  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
937  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
938  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
939  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
940  ASSIGN_SHUFFLE_FUNC(15, 5, sse4);
941  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
942  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
943  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
944  ASSIGN_SHUFFLE_FUNC(16, 4, sse4);
945  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
946  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
947  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
948  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
949  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
950  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
951  av_assert1(out->func);
952  return 0;
953 }
954 
955 /* Normalize clear values into 32-bit integer constants */
956 static void normalize_clear(SwsOp *op)
957 {
958  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
959  SwsImplResult res;
960  union {
961  uint32_t u32;
962  int i;
963  } c;
964 
965  ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, &res);
966 
967  for (int i = 0; i < 4; i++) {
968  if (!SWS_COMP_TEST(op->clear.mask, i))
969  continue;
970  switch (ff_sws_pixel_type_size(op->type)) {
971  case 1: c.u32 = 0x1010101U * res.priv.u8[i]; break;
972  case 2: c.u32 = (uint32_t) res.priv.u16[i] << 16 | res.priv.u16[i]; break;
973  case 4: c.u32 = res.priv.u32[i]; break;
974  }
975 
976  op->clear.value[i].num = c.i;
977  op->clear.value[i].den = 1;
978  }
979 }
980 
982 {
983  int ret;
984  const int cpu_flags = av_get_cpu_flags();
985  const int mmsize = get_mmsize(cpu_flags);
986  if (mmsize < 0)
987  return mmsize;
988 
989  /* Special fast path for in-place packed shuffle */
990  ret = solve_shuffle(ops, mmsize, out);
991  if (ret != AVERROR(ENOTSUP))
992  return ret;
993 
995  if (!chain)
996  return AVERROR(ENOMEM);
997 
998  *out = (SwsCompiledOp) {
999  .priv = chain,
1000  .slice_align = 1,
1002 
1003  /* Use at most two full YMM regs during the widest precision section */
1004  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
1005  };
1006 
1007  for (int i = 0; i < ops->num_ops; i++) {
1008  int op_block_size = out->block_size;
1009  SwsOp *op = &ops->ops[i];
1010 
1011  if (op_is_type_invariant(op)) {
1012  if (op->op == SWS_OP_CLEAR)
1014  op_block_size *= ff_sws_pixel_type_size(op->type);
1015  op->type = SWS_PIXEL_U8;
1016  }
1017 
1019  op, op_block_size, chain);
1020  if (ret < 0) {
1021  av_log(ctx, AV_LOG_TRACE, "Failed to compile op %d\n", i);
1022  ff_sws_op_chain_free(chain);
1023  return ret;
1024  }
1025  }
1026 
1027 #define ASSIGN_PROCESS_FUNC(NAME) \
1028  do { \
1029  SWS_DECL_FUNC(NAME); \
1030  out->func = NAME; \
1031  } while (0)
1032 
1033  const SwsOp *read = ff_sws_op_list_input(ops);
1034  const SwsOp *write = ff_sws_op_list_output(ops);
1035  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
1036  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
1037  switch (FFMAX(read_planes, write_planes)) {
1038  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
1039  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
1040  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
1041  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
1042  }
1043 
1044  if (ret < 0) {
1045  ff_sws_op_chain_free(chain);
1046  return ret;
1047  }
1048 
1049  out->cpu_flags = chain->cpu_flags;
1050  out->over_read = chain->over_read;
1051  out->over_write = chain->over_write;
1052  return 0;
1053 }
1054 
1056  .name = "x86",
1057  .compile = compile,
1058  .hw_format = AV_PIX_FMT_NONE,
1059 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:50
SwsOpTable
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:159
check_filter_fma
static bool check_filter_fma(const SwsImplParams *params)
Definition: ops.c:291
SWS_PIXEL_U16
@ SWS_PIXEL_U16
Definition: ops.h:36
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:53
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ff_sws_setup_clear
int ff_sws_setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:280
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:854
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none.
Definition: ops.c:671
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:62
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:748
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:1055
matrix
Definition: vc1dsp.c:43
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, const SwsOp *op, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:180
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:243
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:956
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:64
av_const
#define av_const
Definition: attributes.h:113
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:157
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:713
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:267
SwsOpBackend::name
const char * name
Definition: ops_dispatch.h:131
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:77
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpTable::block_size
int block_size
Definition: ops_chain.h:161
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:54
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:209
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: ops.h:38
SwsOpList::num_ops
int num_ops
Definition: ops.h:290
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:35
SWS_COMP_TEST
#define SWS_COMP_TEST(mask, X)
Definition: ops.h:89
ff_sws_pixel_type_is_int
bool ff_sws_pixel_type_is_int(SwsPixelType type)
Definition: ops.c:92
AVRational::num
int num
Numerator.
Definition: rational.h:59
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:62
SwsOpChain::over_read
int over_read
Definition: ops_chain.h:90
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
Q
#define Q(q)
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(SwsOpPriv *)
Definition: ops_chain.h:87
avassert.h
AV_LOG_TRACE
#define AV_LOG_TRACE
Extremely verbose debugging, useful for libav* development.
Definition: log.h:236
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
AVFormatContext::flags
int flags
Flags modifying the (de)muxer behaviour.
Definition: avformat.h:1435
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
setup_clear
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:132
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:680
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_dispatch.h:130
SwsReadWriteOp::kernel
SwsFilterWeights * kernel
Definition: ops.h:138
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
SwsImplParams::op
const SwsOp * op
Definition: ops_chain.h:107
tables
static const SwsOpTable *const tables[]
Definition: ops.c:844
check_filter_4x4_h
static bool check_filter_4x4_h(const SwsImplParams *params)
Definition: ops.c:430
setup_rw
static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:61
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:891
setup_filter_4x4_h
static int setup_filter_4x4_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:454
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SwsImplParams
Definition: ops_chain.h:105
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOp::type
SwsPixelType type
Definition: ops.h:240
movsize
static int movsize(const int bytes, const int mmsize)
Definition: ops.c:884
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:50
size
int size
Definition: twinvq_data.h:10344
setup_swap_bytes
static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:102
SwsShiftOp::amount
uint8_t amount
Definition: ops.h:165
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:51
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:52
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:981
SwsImplParams::ctx
SwsContext * ctx
Definition: ops_chain.h:108
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SwsOpList::ops
SwsOp * ops
Definition: ops.h:289
weights
static const int weights[]
Definition: hevc_pel.c:32
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:870
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
hscale_sizeof_weight
static int hscale_sizeof_weight(const SwsOp *op)
Definition: ops.c:336
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:238
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:149
av_cmp_q
static int av_cmp_q(AVRational a, AVRational b)
Compare two rationals.
Definition: rational.h:89
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_dispatch.h:100
setup_shift
static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:166
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
SwsImplResult::priv
SwsOpPriv priv
Definition: ops_chain.h:113
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:128
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:798
SwsOp::shift
SwsShiftOp shift
Definition: ops.h:246
av_mul_q
AVRational av_mul_q(AVRational b, AVRational c)
Multiply two rationals.
Definition: rational.c:80
SwsReadWriteOp::elems
uint8_t elems
Examples: rgba = 4x u8 packed yuv444p = 3x u8 rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack monow = ...
Definition: ops.h:126
mem.h
w
uint8_t w
Definition: llvidencdsp.c:39
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:278
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:316
int32_t
int32_t
Definition: audioconvert.c:56
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:671
stride
#define stride
Definition: h264pred_template.c:536
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h264.c:190
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:288
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:541
SwsContext
Main external API structure.
Definition: swscale.h:206
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
SwsImplResult
Definition: ops_chain.h:111
SwsImplParams::table
const SwsOpTable * table
Definition: ops_chain.h:106
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
SwsOpChain::over_write
int over_write
Definition: ops_chain.h:91
min
float min
Definition: vorbis_enc_data.h:429
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:346