FFmpeg
ops_optimizer.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/avassert.h"
22 #include <libavutil/bswap.h>
23 #include "libavutil/rational.h"
24 
25 #include "ops.h"
26 #include "ops_internal.h"
27 
28 #define RET(x) \
29  do { \
30  if ((ret = (x)) < 0) \
31  return ret; \
32  } while (0)
33 
34 /* Returns true for operations that are independent per channel. These can
35  * usually be commuted freely other such operations. */
37 {
38  switch (op) {
39  case SWS_OP_SWAP_BYTES:
40  case SWS_OP_LSHIFT:
41  case SWS_OP_RSHIFT:
42  case SWS_OP_CONVERT:
43  case SWS_OP_DITHER:
44  case SWS_OP_MIN:
45  case SWS_OP_MAX:
46  case SWS_OP_SCALE:
47  return true;
48  case SWS_OP_INVALID:
49  case SWS_OP_READ:
50  case SWS_OP_WRITE:
51  case SWS_OP_SWIZZLE:
52  case SWS_OP_CLEAR:
53  case SWS_OP_LINEAR:
54  case SWS_OP_PACK:
55  case SWS_OP_UNPACK:
56  return false;
57  case SWS_OP_TYPE_NB:
58  break;
59  }
60 
61  av_unreachable("Invalid operation type!");
62  return false;
63 }
64 
65 /* merge_comp_flags() forms a monoid with flags_identity as the null element */
66 static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
67 static unsigned merge_comp_flags(unsigned a, unsigned b)
68 {
69  const unsigned flags_or = SWS_COMP_GARBAGE;
70  const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
71  return ((a & b) & flags_and) | ((a | b) & flags_or);
72 }
73 
74 /* Infer + propagate known information about components */
76 {
77  SwsComps next = { .unused = {true, true, true, true} };
78  SwsComps prev = { .flags = {
80  }};
81 
82  /* Forwards pass, propagates knowledge about the incoming pixel values */
83  for (int n = 0; n < ops->num_ops; n++) {
84  SwsOp *op = &ops->ops[n];
85 
86  /* Prefill min/max values automatically; may have to be fixed in
87  * special cases */
88  memcpy(op->comps.min, prev.min, sizeof(prev.min));
89  memcpy(op->comps.max, prev.max, sizeof(prev.max));
90 
91  if (op->op != SWS_OP_SWAP_BYTES) {
92  ff_sws_apply_op_q(op, op->comps.min);
93  ff_sws_apply_op_q(op, op->comps.max);
94  }
95 
96  switch (op->op) {
97  case SWS_OP_READ:
98  for (int i = 0; i < op->rw.elems; i++) {
99  if (ff_sws_pixel_type_is_int(op->type)) {
100  int bits = 8 * ff_sws_pixel_type_size(op->type);
101  if (!op->rw.packed && ops->src.desc) {
102  /* Use legal value range from pixdesc if available;
103  * we don't need to do this for packed formats because
104  * non-byte-aligned packed formats will necessarily go
105  * through SWS_OP_UNPACK anyway */
106  for (int c = 0; c < 4; c++) {
107  if (ops->src.desc->comp[c].plane == i) {
108  bits = ops->src.desc->comp[c].depth;
109  break;
110  }
111  }
112  }
113 
114  op->comps.flags[i] = SWS_COMP_EXACT;
115  op->comps.min[i] = Q(0);
116  op->comps.max[i] = Q((1ULL << bits) - 1);
117  }
118  }
119  for (int i = op->rw.elems; i < 4; i++)
120  op->comps.flags[i] = prev.flags[i];
121  break;
122  case SWS_OP_WRITE:
123  for (int i = 0; i < op->rw.elems; i++)
124  av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
125  /* fall through */
126  case SWS_OP_SWAP_BYTES:
127  case SWS_OP_LSHIFT:
128  case SWS_OP_RSHIFT:
129  case SWS_OP_MIN:
130  case SWS_OP_MAX:
131  /* Linearly propagate flags per component */
132  for (int i = 0; i < 4; i++)
133  op->comps.flags[i] = prev.flags[i];
134  break;
135  case SWS_OP_DITHER:
136  /* Strip zero flag because of the nonzero dithering offset */
137  for (int i = 0; i < 4; i++)
138  op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
139  break;
140  case SWS_OP_UNPACK:
141  for (int i = 0; i < 4; i++) {
142  if (op->pack.pattern[i])
143  op->comps.flags[i] = prev.flags[0];
144  else
145  op->comps.flags[i] = SWS_COMP_GARBAGE;
146  }
147  break;
148  case SWS_OP_PACK: {
149  unsigned flags = flags_identity;
150  for (int i = 0; i < 4; i++) {
151  if (op->pack.pattern[i])
152  flags = merge_comp_flags(flags, prev.flags[i]);
153  if (i > 0) /* clear remaining comps for sanity */
154  op->comps.flags[i] = SWS_COMP_GARBAGE;
155  }
156  op->comps.flags[0] = flags;
157  break;
158  }
159  case SWS_OP_CLEAR:
160  for (int i = 0; i < 4; i++) {
161  if (op->c.q4[i].den) {
162  if (op->c.q4[i].num == 0) {
163  op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT;
164  } else if (op->c.q4[i].den == 1) {
165  op->comps.flags[i] = SWS_COMP_EXACT;
166  }
167  } else {
168  op->comps.flags[i] = prev.flags[i];
169  }
170  }
171  break;
172  case SWS_OP_SWIZZLE:
173  for (int i = 0; i < 4; i++)
174  op->comps.flags[i] = prev.flags[op->swizzle.in[i]];
175  break;
176  case SWS_OP_CONVERT:
177  for (int i = 0; i < 4; i++) {
178  op->comps.flags[i] = prev.flags[i];
179  if (ff_sws_pixel_type_is_int(op->convert.to))
180  op->comps.flags[i] |= SWS_COMP_EXACT;
181  }
182  break;
183  case SWS_OP_LINEAR:
184  for (int i = 0; i < 4; i++) {
185  unsigned flags = flags_identity;
186  AVRational min = Q(0), max = Q(0);
187  for (int j = 0; j < 4; j++) {
188  const AVRational k = op->lin.m[i][j];
189  AVRational mink = av_mul_q(prev.min[j], k);
190  AVRational maxk = av_mul_q(prev.max[j], k);
191  if (k.num) {
192  flags = merge_comp_flags(flags, prev.flags[j]);
193  if (k.den != 1) /* fractional coefficient */
194  flags &= ~SWS_COMP_EXACT;
195  if (k.num < 0)
196  FFSWAP(AVRational, mink, maxk);
197  min = av_add_q(min, mink);
198  max = av_add_q(max, maxk);
199  }
200  }
201  if (op->lin.m[i][4].num) { /* nonzero offset */
202  flags &= ~SWS_COMP_ZERO;
203  if (op->lin.m[i][4].den != 1) /* fractional offset */
204  flags &= ~SWS_COMP_EXACT;
205  min = av_add_q(min, op->lin.m[i][4]);
206  max = av_add_q(max, op->lin.m[i][4]);
207  }
208  op->comps.flags[i] = flags;
209  op->comps.min[i] = min;
210  op->comps.max[i] = max;
211  }
212  break;
213  case SWS_OP_SCALE:
214  for (int i = 0; i < 4; i++) {
215  op->comps.flags[i] = prev.flags[i];
216  if (op->c.q.den != 1) /* fractional scale */
217  op->comps.flags[i] &= ~SWS_COMP_EXACT;
218  if (op->c.q.num < 0)
219  FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
220  }
221  break;
222 
223  case SWS_OP_INVALID:
224  case SWS_OP_TYPE_NB:
225  av_unreachable("Invalid operation type!");
226  }
227 
228  prev = op->comps;
229  }
230 
231  /* Backwards pass, solves for component dependencies */
232  for (int n = ops->num_ops - 1; n >= 0; n--) {
233  SwsOp *op = &ops->ops[n];
234 
235  switch (op->op) {
236  case SWS_OP_READ:
237  case SWS_OP_WRITE:
238  for (int i = 0; i < op->rw.elems; i++)
239  op->comps.unused[i] = op->op == SWS_OP_READ;
240  for (int i = op->rw.elems; i < 4; i++)
241  op->comps.unused[i] = next.unused[i];
242  break;
243  case SWS_OP_SWAP_BYTES:
244  case SWS_OP_LSHIFT:
245  case SWS_OP_RSHIFT:
246  case SWS_OP_CONVERT:
247  case SWS_OP_DITHER:
248  case SWS_OP_MIN:
249  case SWS_OP_MAX:
250  case SWS_OP_SCALE:
251  for (int i = 0; i < 4; i++)
252  op->comps.unused[i] = next.unused[i];
253  break;
254  case SWS_OP_UNPACK: {
255  bool unused = true;
256  for (int i = 0; i < 4; i++) {
257  if (op->pack.pattern[i])
258  unused &= next.unused[i];
259  op->comps.unused[i] = i > 0;
260  }
261  op->comps.unused[0] = unused;
262  break;
263  }
264  case SWS_OP_PACK:
265  for (int i = 0; i < 4; i++) {
266  if (op->pack.pattern[i])
267  op->comps.unused[i] = next.unused[0];
268  else
269  op->comps.unused[i] = true;
270  }
271  break;
272  case SWS_OP_CLEAR:
273  for (int i = 0; i < 4; i++) {
274  if (op->c.q4[i].den)
275  op->comps.unused[i] = true;
276  else
277  op->comps.unused[i] = next.unused[i];
278  }
279  break;
280  case SWS_OP_SWIZZLE: {
281  bool unused[4] = { true, true, true, true };
282  for (int i = 0; i < 4; i++)
283  unused[op->swizzle.in[i]] &= next.unused[i];
284  for (int i = 0; i < 4; i++)
285  op->comps.unused[i] = unused[i];
286  break;
287  }
288  case SWS_OP_LINEAR:
289  for (int j = 0; j < 4; j++) {
290  bool unused = true;
291  for (int i = 0; i < 4; i++) {
292  if (op->lin.m[i][j].num)
293  unused &= next.unused[i];
294  }
295  op->comps.unused[j] = unused;
296  }
297  break;
298  }
299 
300  next = op->comps;
301  }
302 }
303 
304 /* returns log2(x) only if x is a power of two, or 0 otherwise */
305 static int exact_log2(const int x)
306 {
307  int p;
308  if (x <= 0)
309  return 0;
310  p = av_log2(x);
311  return (1 << p) == x ? p : 0;
312 }
313 
314 static int exact_log2_q(const AVRational x)
315 {
316  if (x.den == 1)
317  return exact_log2(x.num);
318  else if (x.num == 1)
319  return -exact_log2(x.den);
320  else
321  return 0;
322 }
323 
324 /**
325  * If a linear operation can be reduced to a scalar multiplication, returns
326  * the corresponding scaling factor, or 0 otherwise.
327  */
328 static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
329  SwsConst *out_scale)
330 {
331  SwsConst scale = {0};
332 
333  /* There are components not on the main diagonal */
334  if (c->mask & ~SWS_MASK_DIAG4)
335  return false;
336 
337  for (int i = 0; i < 4; i++) {
338  const AVRational s = c->m[i][i];
339  if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i])
340  continue;
341  if (scale.q.den && av_cmp_q(s, scale.q))
342  return false;
343  scale.q = s;
344  }
345 
346  if (scale.q.den)
347  *out_scale = scale;
348  return scale.q.den;
349 }
350 
351 /* Extracts an integer clear operation (subset) from the given linear op. */
353  SwsConst *out_clear)
354 {
355  SwsConst clear = {0};
356  bool ret = false;
357 
358  for (int i = 0; i < 4; i++) {
359  bool const_row = c->m[i][4].den == 1; /* offset is integer */
360  for (int j = 0; j < 4; j++) {
361  const_row &= c->m[i][j].num == 0 || /* scalar is zero */
362  (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
363  }
364  if (const_row && (c->mask & SWS_MASK_ROW(i))) {
365  clear.q4[i] = c->m[i][4];
366  for (int j = 0; j < 5; j++)
367  c->m[i][j] = Q(i == j);
368  c->mask &= ~SWS_MASK_ROW(i);
369  ret = true;
370  }
371  }
372 
373  if (ret)
374  *out_clear = clear;
375  return ret;
376 }
377 
378 /* Unswizzle a linear operation by aligning single-input rows with
379  * their corresponding diagonal */
380 static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
381 {
382  SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
383  SwsLinearOp c = *op;
384 
385  for (int i = 0; i < 4; i++) {
386  int idx = -1;
387  for (int j = 0; j < 4; j++) {
388  if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
389  continue;
390  if (idx >= 0)
391  return false; /* multiple inputs */
392  idx = j;
393  }
394 
395  if (idx >= 0 && idx != i) {
396  /* Move coefficient to the diagonal */
397  c.m[i][i] = c.m[i][idx];
398  c.m[i][idx] = Q(0);
399  swiz.in[i] = idx;
400  }
401  }
402 
403  if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
404  return false; /* no swizzle was identified */
405 
406  c.mask = ff_sws_linear_mask(c);
407  *out_swiz = swiz;
408  *op = c;
409  return true;
410 }
411 
413 {
414  int ret;
415 
416 retry:
418 
419  for (int n = 0; n < ops->num_ops;) {
420  SwsOp dummy = {0};
421  SwsOp *op = &ops->ops[n];
422  SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
423  SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;
424 
425  /* common helper variable */
426  bool noop = true;
427 
428  switch (op->op) {
429  case SWS_OP_READ:
430  /* Optimized further into refcopy / memcpy */
431  if (next->op == SWS_OP_WRITE &&
432  next->rw.elems == op->rw.elems &&
433  next->rw.packed == op->rw.packed &&
434  next->rw.frac == op->rw.frac)
435  {
436  ff_sws_op_list_remove_at(ops, n, 2);
437  av_assert1(ops->num_ops == 0);
438  return 0;
439  }
440 
441  /* Skip reading extra unneeded components */
442  if (!op->rw.packed) {
443  int needed = op->rw.elems;
444  while (needed > 0 && next->comps.unused[needed - 1])
445  needed--;
446  if (op->rw.elems != needed) {
447  op->rw.elems = needed;
448  goto retry;
449  }
450  }
451  break;
452 
453  case SWS_OP_SWAP_BYTES:
454  /* Redundant (double) swap */
455  if (next->op == SWS_OP_SWAP_BYTES) {
456  ff_sws_op_list_remove_at(ops, n, 2);
457  goto retry;
458  }
459  break;
460 
461  case SWS_OP_UNPACK:
462  /* Redundant unpack+pack */
463  if (next->op == SWS_OP_PACK && next->type == op->type &&
464  next->pack.pattern[0] == op->pack.pattern[0] &&
465  next->pack.pattern[1] == op->pack.pattern[1] &&
466  next->pack.pattern[2] == op->pack.pattern[2] &&
467  next->pack.pattern[3] == op->pack.pattern[3])
468  {
469  ff_sws_op_list_remove_at(ops, n, 2);
470  goto retry;
471  }
472  break;
473 
474  case SWS_OP_LSHIFT:
475  case SWS_OP_RSHIFT:
476  /* Two shifts in the same direction */
477  if (next->op == op->op) {
478  op->c.u += next->c.u;
479  ff_sws_op_list_remove_at(ops, n + 1, 1);
480  goto retry;
481  }
482 
483  /* No-op shift */
484  if (!op->c.u) {
485  ff_sws_op_list_remove_at(ops, n, 1);
486  goto retry;
487  }
488  break;
489 
490  case SWS_OP_CLEAR:
491  for (int i = 0; i < 4; i++) {
492  if (!op->c.q4[i].den)
493  continue;
494 
495  if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
496  !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
497  op->c.q4[i].num == 0)
498  {
499  /* Redundant clear-to-zero of zero component */
500  op->c.q4[i].den = 0;
501  } else if (next->comps.unused[i]) {
502  /* Unnecessary clear of unused component */
503  op->c.q4[i] = (AVRational) {0, 0};
504  } else if (op->c.q4[i].den) {
505  noop = false;
506  }
507  }
508 
509  if (noop) {
510  ff_sws_op_list_remove_at(ops, n, 1);
511  goto retry;
512  }
513 
514  /* Transitive clear */
515  if (next->op == SWS_OP_CLEAR) {
516  for (int i = 0; i < 4; i++) {
517  if (next->c.q4[i].den)
518  op->c.q4[i] = next->c.q4[i];
519  }
520  ff_sws_op_list_remove_at(ops, n + 1, 1);
521  goto retry;
522  }
523 
524  /* Prefer to clear as late as possible, to avoid doing
525  * redundant work */
526  if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) ||
527  next->op == SWS_OP_SWIZZLE)
528  {
529  if (next->op == SWS_OP_CONVERT)
530  op->type = next->convert.to;
531  ff_sws_apply_op_q(next, op->c.q4);
532  FFSWAP(SwsOp, *op, *next);
533  goto retry;
534  }
535  break;
536 
537  case SWS_OP_SWIZZLE: {
538  bool seen[4] = {0};
539  bool has_duplicates = false;
540  for (int i = 0; i < 4; i++) {
541  if (next->comps.unused[i])
542  continue;
543  if (op->swizzle.in[i] != i)
544  noop = false;
545  has_duplicates |= seen[op->swizzle.in[i]];
546  seen[op->swizzle.in[i]] = true;
547  }
548 
549  /* Identity swizzle */
550  if (noop) {
551  ff_sws_op_list_remove_at(ops, n, 1);
552  goto retry;
553  }
554 
555  /* Transitive swizzle */
556  if (next->op == SWS_OP_SWIZZLE) {
557  const SwsSwizzleOp orig = op->swizzle;
558  for (int i = 0; i < 4; i++)
559  op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
560  ff_sws_op_list_remove_at(ops, n + 1, 1);
561  goto retry;
562  }
563 
564  /* Try to push swizzles with duplicates towards the output */
565  if (has_duplicates && op_type_is_independent(next->op)) {
566  if (next->op == SWS_OP_CONVERT)
567  op->type = next->convert.to;
568  if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) {
569  /* Un-swizzle the next operation */
570  const SwsConst c = next->c;
571  for (int i = 0; i < 4; i++) {
572  if (!next->comps.unused[i])
573  next->c.q4[op->swizzle.in[i]] = c.q4[i];
574  }
575  }
576  FFSWAP(SwsOp, *op, *next);
577  goto retry;
578  }
579 
580  /* Move swizzle out of the way between two converts so that
581  * they may be merged */
582  if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) {
583  op->type = next->convert.to;
584  FFSWAP(SwsOp, *op, *next);
585  goto retry;
586  }
587  break;
588  }
589 
590  case SWS_OP_CONVERT:
591  /* No-op conversion */
592  if (op->type == op->convert.to) {
593  ff_sws_op_list_remove_at(ops, n, 1);
594  goto retry;
595  }
596 
597  /* Transitive conversion */
598  if (next->op == SWS_OP_CONVERT &&
599  op->convert.expand == next->convert.expand)
600  {
601  av_assert1(op->convert.to == next->type);
602  op->convert.to = next->convert.to;
603  ff_sws_op_list_remove_at(ops, n + 1, 1);
604  goto retry;
605  }
606 
607  /* Conversion followed by integer expansion */
608  if (next->op == SWS_OP_SCALE && !op->convert.expand &&
609  !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to)))
610  {
611  op->convert.expand = true;
612  ff_sws_op_list_remove_at(ops, n + 1, 1);
613  goto retry;
614  }
615  break;
616 
617  case SWS_OP_MIN:
618  for (int i = 0; i < 4; i++) {
619  if (next->comps.unused[i] || !op->c.q4[i].den)
620  continue;
621  if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
622  noop = false;
623  }
624 
625  if (noop) {
626  ff_sws_op_list_remove_at(ops, n, 1);
627  goto retry;
628  }
629  break;
630 
631  case SWS_OP_MAX:
632  for (int i = 0; i < 4; i++) {
633  if (next->comps.unused[i] || !op->c.q4[i].den)
634  continue;
635  if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
636  noop = false;
637  }
638 
639  if (noop) {
640  ff_sws_op_list_remove_at(ops, n, 1);
641  goto retry;
642  }
643  break;
644 
645  case SWS_OP_DITHER:
646  for (int i = 0; i < 4; i++) {
647  noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
648  next->comps.unused[i];
649  }
650 
651  if (noop) {
652  ff_sws_op_list_remove_at(ops, n, 1);
653  goto retry;
654  }
655  break;
656 
657  case SWS_OP_LINEAR: {
658  SwsSwizzleOp swizzle;
659  SwsConst c;
660 
661  /* No-op (identity) linear operation */
662  if (!op->lin.mask) {
663  ff_sws_op_list_remove_at(ops, n, 1);
664  goto retry;
665  }
666 
667  if (next->op == SWS_OP_LINEAR) {
668  /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
669  const SwsLinearOp m1 = op->lin;
670  const SwsLinearOp m2 = next->lin;
671  for (int i = 0; i < 4; i++) {
672  for (int j = 0; j < 5; j++) {
673  AVRational sum = Q(0);
674  for (int k = 0; k < 4; k++)
675  sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
676  if (j == 4) /* m1.m[4][j] == 1 */
677  sum = av_add_q(sum, m2.m[i][4]);
678  op->lin.m[i][j] = sum;
679  }
680  }
681  op->lin.mask = ff_sws_linear_mask(op->lin);
682  ff_sws_op_list_remove_at(ops, n + 1, 1);
683  goto retry;
684  }
685 
686  /* Optimize away zero columns */
687  for (int j = 0; j < 4; j++) {
688  const uint32_t col = SWS_MASK_COL(j);
689  if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
690  continue;
691  for (int i = 0; i < 4; i++)
692  op->lin.m[i][j] = Q(i == j);
693  op->lin.mask &= ~col;
694  goto retry;
695  }
696 
697  /* Optimize away unused rows */
698  for (int i = 0; i < 4; i++) {
699  const uint32_t row = SWS_MASK_ROW(i);
700  if (!next->comps.unused[i] || !(op->lin.mask & row))
701  continue;
702  for (int j = 0; j < 5; j++)
703  op->lin.m[i][j] = Q(i == j);
704  op->lin.mask &= ~row;
705  goto retry;
706  }
707 
708  /* Convert constant rows to explicit clear instruction */
709  if (extract_constant_rows(&op->lin, prev->comps, &c)) {
710  RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
711  .op = SWS_OP_CLEAR,
712  .type = op->type,
713  .comps = op->comps,
714  .c = c,
715  }));
716  goto retry;
717  }
718 
719  /* Multiplication by scalar constant */
720  if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
721  op->op = SWS_OP_SCALE;
722  op->c = c;
723  goto retry;
724  }
725 
726  /* Swizzle by fixed pattern */
727  if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
728  RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
729  .op = SWS_OP_SWIZZLE,
730  .type = op->type,
731  .swizzle = swizzle,
732  }));
733  goto retry;
734  }
735  break;
736  }
737 
738  case SWS_OP_SCALE: {
739  const int factor2 = exact_log2_q(op->c.q);
740 
741  /* No-op scaling */
742  if (op->c.q.num == 1 && op->c.q.den == 1) {
743  ff_sws_op_list_remove_at(ops, n, 1);
744  goto retry;
745  }
746 
747  /* Scaling by integer before conversion to int */
748  if (op->c.q.den == 1 &&
749  next->op == SWS_OP_CONVERT &&
751  {
752  op->type = next->convert.to;
753  FFSWAP(SwsOp, *op, *next);
754  goto retry;
755  }
756 
757  /* Scaling by exact power of two */
758  if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
759  op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
760  op->c.u = FFABS(factor2);
761  goto retry;
762  }
763  break;
764  }
765  }
766 
767  /* No optimization triggered, move on to next operation */
768  n++;
769  }
770 
771  return 0;
772 }
773 
774 int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
775  int size, uint8_t clear_val,
776  int *read_bytes, int *write_bytes)
777 {
778  const SwsOp read = ops->ops[0];
779  const int read_size = ff_sws_pixel_type_size(read.type);
780  uint32_t mask[4] = {0};
781 
782  if (!ops->num_ops || read.op != SWS_OP_READ)
783  return AVERROR(EINVAL);
784  if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
785  return AVERROR(ENOTSUP);
786 
787  for (int i = 0; i < read.rw.elems; i++)
788  mask[i] = 0x01010101 * i * read_size + 0x03020100;
789 
790  for (int opidx = 1; opidx < ops->num_ops; opidx++) {
791  const SwsOp *op = &ops->ops[opidx];
792  switch (op->op) {
793  case SWS_OP_SWIZZLE: {
794  uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
795  for (int i = 0; i < 4; i++)
796  mask[i] = orig[op->swizzle.in[i]];
797  break;
798  }
799 
800  case SWS_OP_SWAP_BYTES:
801  for (int i = 0; i < 4; i++) {
802  switch (ff_sws_pixel_type_size(op->type)) {
803  case 2: mask[i] = av_bswap16(mask[i]); break;
804  case 4: mask[i] = av_bswap32(mask[i]); break;
805  }
806  }
807  break;
808 
809  case SWS_OP_CLEAR:
810  for (int i = 0; i < 4; i++) {
811  if (!op->c.q4[i].den)
812  continue;
813  if (op->c.q4[i].num != 0 || !clear_val)
814  return AVERROR(ENOTSUP);
815  mask[i] = 0x1010101ul * clear_val;
816  }
817  break;
818 
819  case SWS_OP_CONVERT: {
820  if (!op->convert.expand)
821  return AVERROR(ENOTSUP);
822  for (int i = 0; i < 4; i++) {
823  switch (ff_sws_pixel_type_size(op->type)) {
824  case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF); break;
825  case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
826  }
827  }
828  break;
829  }
830 
831  case SWS_OP_WRITE: {
832  if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1))
833  return AVERROR(ENOTSUP);
834 
835  /* Initialize to no-op */
836  memset(shuffle, clear_val, size);
837 
838  const int write_size = ff_sws_pixel_type_size(op->type);
839  const int read_chunk = read.rw.elems * read_size;
840  const int write_chunk = op->rw.elems * write_size;
841  const int num_groups = size / FFMAX(read_chunk, write_chunk);
842  for (int n = 0; n < num_groups; n++) {
843  const int base_in = n * read_chunk;
844  const int base_out = n * write_chunk;
845  for (int i = 0; i < op->rw.elems; i++) {
846  const int offset = base_out + i * write_size;
847  for (int b = 0; b < write_size; b++) {
848  const uint8_t idx = mask[i] >> (b * 8);
849  if (idx != clear_val)
850  shuffle[offset + b] = base_in + idx;
851  }
852  }
853  }
854 
855  *read_bytes = num_groups * read_chunk;
856  *write_bytes = num_groups * write_chunk;
857  return num_groups;
858  }
859 
860  default:
861  return AVERROR(ENOTSUP);
862  }
863  }
864 
865  return AVERROR(EINVAL);
866 }
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:48
flags
const SwsFlags flags[]
Definition: swscale.c:61
SwsComps::flags
unsigned flags[4]
Definition: ops.h:88
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:58
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ff_sws_op_list_update_comps
void ff_sws_op_list_update_comps(SwsOpList *ops)
Infer + propagate known information about components.
Definition: ops_optimizer.c:75
SWS_OP_LSHIFT
@ SWS_OP_LSHIFT
Definition: ops.h:56
SWS_OP_UNPACK
@ SWS_OP_UNPACK
Definition: ops.h:51
SwsSwizzleOp::mask
uint32_t mask
Definition: ops.h:120
SwsConst
Definition: ops.h:77
SWS_COMP_ZERO
@ SWS_COMP_ZERO
Definition: ops.h:74
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:55
ff_sws_linear_mask
uint32_t ff_sws_linear_mask(const SwsLinearOp c)
Definition: ops.c:328
SwsOp::swizzle
SwsSwizzleOp swizzle
Definition: ops.h:186
SwsLinearOp::m
AVRational m[4][5]
Generalized 5x5 affine transformation: [ Out.x ] = [ A B C D E ] [ Out.y ] = [ F G H I J ] * [ x y z ...
Definition: ops.h:151
SwsComps::unused
bool unused[4]
Definition: ops.h:89
SwsOp::convert
SwsConvertOp convert
Definition: ops.h:187
rational.h
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:184
ops.h
SWS_OP_DITHER
@ SWS_OP_DITHER
Definition: ops.h:60
AVComponentDescriptor::depth
int depth
Number of bits in the component.
Definition: pixdesc.h:57
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
b
#define b
Definition: input.c:42
ff_sws_op_list_optimize
int ff_sws_op_list_optimize(SwsOpList *ops)
Fuse compatible and eliminate redundant operations, as well as replacing some operations with more ef...
Definition: ops_optimizer.c:412
max
#define max(a, b)
Definition: cuda_runtime.h:33
SWS_OP_TYPE_NB
@ SWS_OP_TYPE_NB
Definition: ops.h:68
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:64
SWS_MASK_ROW
#define SWS_MASK_ROW(I)
Definition: ops.h:157
SwsComps::max
AVRational max[4]
Definition: ops.h:93
SWS_MASK_DIAG4
@ SWS_MASK_DIAG4
Definition: ops.h:171
SwsOpList::num_ops
int num_ops
Definition: ops.h:211
SWS_MASK_COL
#define SWS_MASK_COL(J)
Definition: ops.h:158
dummy
int dummy
Definition: motion.c:66
SwsOp::c
SwsConst c
Definition: ops.h:189
SwsSwizzleOp
Definition: ops.h:114
ff_sws_pixel_type_is_int
bool ff_sws_pixel_type_is_int(SwsPixelType type)
Definition: ops.c:79
AVRational::num
int num
Numerator.
Definition: rational.h:59
SwsOp::op
SwsOpType op
Definition: ops.h:180
Q
#define Q(q)
SWS_OP_SCALE
@ SWS_OP_SCALE
Definition: ops.h:64
avassert.h
s
#define s(width, name)
Definition: cbs_vp9.c:198
SWS_SWIZZLE
#define SWS_SWIZZLE(X, Y, Z, W)
Definition: ops.h:126
SwsComps::min
AVRational min[4]
Definition: ops.h:93
read_chunk
static int read_chunk(AVFormatContext *s)
Definition: dhav.c:173
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
bits
uint8_t bits
Definition: vp3data.h:128
SWS_OP_MIN
@ SWS_OP_MIN
Definition: ops.h:65
exact_log2_q
static int exact_log2_q(const AVRational x)
Definition: ops_optimizer.c:314
ff_sws_pixel_expand
static AVRational ff_sws_pixel_expand(SwsPixelType from, SwsPixelType to)
Definition: ops_internal.h:30
SWS_OP_LINEAR
@ SWS_OP_LINEAR
Definition: ops.h:63
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
SWS_OP_PACK
@ SWS_OP_PACK
Definition: ops.h:52
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
av_unreachable
#define av_unreachable(msg)
Asserts that are used as compiler optimization hints depending upon ASSERT_LEVEL and NBDEBUG.
Definition: avassert.h:108
SwsReadWriteOp::frac
uint8_t frac
Definition: ops.h:98
AVComponentDescriptor::plane
int plane
Which of the 4 planes contains the component.
Definition: pixdesc.h:34
SWS_COMP_GARBAGE
@ SWS_COMP_GARBAGE
Definition: ops.h:72
SwsConvertOp::to
SwsPixelType to
Definition: ops.h:129
SwsOpType
SwsOpType
Definition: ops.h:44
ff_sws_op_list_remove_at
void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count)
Definition: ops.c:288
RET
#define RET(x)
Copyright (C) 2025 Niklas Haas.
Definition: ops_optimizer.c:28
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_sws_apply_op_q
void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
Apply an operation to an AVRational.
Definition: ops.c:122
SwsConvertOp::expand
bool expand
Definition: ops.h:130
op_type_is_independent
static bool op_type_is_independent(SwsOpType op)
Definition: ops_optimizer.c:36
SwsPackOp::pattern
uint8_t pattern[4]
Definition: ops.h:111
SwsConst::q
AVRational q
Definition: ops.h:80
extract_constant_rows
static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev, SwsConst *out_clear)
Definition: ops_optimizer.c:352
av_bswap32
#define av_bswap32
Definition: bswap.h:47
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:424
SwsOp::type
SwsPixelType type
Definition: ops.h:181
ff_sws_op_list_insert_at
int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op)
Definition: ops.c:298
size
int size
Definition: twinvq_data.h:10344
SWS_OP_RSHIFT
@ SWS_OP_RSHIFT
Definition: ops.h:57
SwsOp::lin
SwsLinearOp lin
Definition: ops.h:183
SwsOpList::src
SwsFormat src
Definition: ops.h:214
SWS_OP_INVALID
@ SWS_OP_INVALID
Definition: ops.h:45
extract_scalar
static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next, SwsConst *out_scale)
If a linear operation can be reduced to a scalar multiplication, returns the corresponding scaling fa...
Definition: ops_optimizer.c:328
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:49
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
SwsOp::comps
SwsComps comps
Definition: ops.h:193
SwsLinearOp
Definition: ops.h:138
noop
#define noop(a)
Definition: h264chroma_template.c:71
merge_comp_flags
static unsigned merge_comp_flags(unsigned a, unsigned b)
Definition: ops_optimizer.c:67
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
extract_swizzle
static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
Definition: ops_optimizer.c:380
flags_identity
static const unsigned flags_identity
Definition: ops_optimizer.c:66
SwsOpList::ops
SwsOp * ops
Definition: ops.h:210
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:57
SwsFormat::desc
const AVPixFmtDescriptor * desc
Definition: format.h:84
needed
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is needed
Definition: filter_design.txt:212
SwsConst::q4
AVRational q4[4]
Definition: ops.h:79
ops_internal.h
SwsOp
Definition: ops.h:179
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
av_cmp_q
static int av_cmp_q(AVRational a, AVRational b)
Compare two rationals.
Definition: rational.h:89
ret
ret
Definition: filter_design.txt:187
bswap.h
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
SWS_OP_MAX
@ SWS_OP_MAX
Definition: ops.h:66
SwsComps
Definition: ops.h:87
SwsConst::u
unsigned u
Definition: ops.h:81
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:99
SWS_OP_SWAP_BYTES
@ SWS_OP_SWAP_BYTES
Definition: ops.h:50
AVPixFmtDescriptor::comp
AVComponentDescriptor comp[4]
Parameters that describe how pixels are packed.
Definition: pixdesc.h:105
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:774
av_mul_q
AVRational av_mul_q(AVRational b, AVRational c)
Multiply two rationals.
Definition: rational.c:80
SWS_COMP_EXACT
@ SWS_COMP_EXACT
Definition: ops.h:73
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:97
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:273
av_add_q
AVRational av_add_q(AVRational b, AVRational c)
Add two rationals.
Definition: rational.c:93
SwsSwizzleOp::in
uint8_t in[4]
Definition: ops.h:121
SWS_OP_CONVERT
@ SWS_OP_CONVERT
Definition: ops.h:59
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:209
av_bswap16
#define av_bswap16
Definition: bswap.h:28
SwsOp::pack
SwsPackOp pack
Definition: ops.h:185
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
av_log2
int av_log2(unsigned v)
Definition: intmath.c:26
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
exact_log2
static int exact_log2(const int x)
Definition: ops_optimizer.c:305
min
float min
Definition: vorbis_enc_data.h:429