FFmpeg
ops_asmgen.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2026 Ramiro Polla
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <assert.h>
22 #include <limits.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 
28 #ifdef _WIN32
29 #include <io.h>
30 #include <fcntl.h>
31 #endif
32 
33 /**
34  * This file is compiled as a standalone build-time tool and must not depend
35  * on internal FFmpeg libraries. The necessary utils are redefined below using
36  * standard C equivalents.
37  */
38 
39 #define AVUTIL_AVASSERT_H
40 #define AVUTIL_LOG_H
41 #define AVUTIL_MACROS_H
42 #define AVUTIL_MEM_H
43 #define av_assert0(cond) assert(cond)
44 #define av_malloc(s) malloc(s)
45 #define av_mallocz(s) calloc(1, s)
46 #define av_realloc(p, s) realloc(p, s)
47 #define av_strdup(s) strdup(s)
48 #define av_free(p) free(p)
49 #define FFMAX(a,b) ((a) > (b) ? (a) : (b))
50 #define FFMIN(a,b) ((a) > (b) ? (b) : (a))
51 
/**
 * Standalone stand-in for libavutil's av_freep().
 *
 * @param ptr address of a pointer variable, passed as void * so that any
 *            pointer-to-pointer type is accepted. May be NULL, in which
 *            case nothing happens. Otherwise the memory the variable
 *            points to is released and the variable is reset to NULL.
 */
static void av_freep(void *ptr)
{
    void **slot = ptr;

    if (!slot)
        return;

    free(*slot); /* free(NULL) is a no-op, so no extra guard is needed */
    *slot = NULL;
}
62 
63 #include "libavutil/dynarray.h"
64 
65 static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
66  const uint8_t *elem_data)
67 {
68  uint8_t *tab_elem_data = NULL;
69 
70  FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, {
71  tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size;
72  if (elem_data)
73  memcpy(tab_elem_data, elem_data, elem_size);
74  }, {
75  av_freep(tab_ptr);
76  *nb_ptr = 0;
77  });
78  return tab_elem_data;
79 }
80 
81 /*********************************************************************/
82 #include "rasm.c"
83 #include "rasm_print.c"
84 #include "ops_impl.c"
85 
86 /**
87  * Implementation parameters for all exported functions. This list is
88  * compiled by performing a dummy run of all conversions in sws_ops and
89  * collecting all functions that need to be generated. This is achieved
90  * by running:
91  * make fate-sws-ops-entries-aarch64 GEN=1
92  */
94 #include "ops_entries.c"
95  { .op = AARCH64_SWS_OP_NONE }
96 };
97 
98 /*********************************************************************/
100 {
101  switch (fmt) {
102  case AARCH64_PIXEL_U8: return 1;
103  case AARCH64_PIXEL_U16: return 2;
104  case AARCH64_PIXEL_U32: return 4;
105  case AARCH64_PIXEL_F32: return 4;
106  default:
107  av_assert0(!"Invalid pixel type!");
108  break;
109  }
110  return 0;
111 }
112 
113 static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
114 {
115  buf_appendf(buf, size, "ff_sws");
116  const ParamField **fields = op_fields[params->op];
117  for (int i = 0; fields[i]; i++) {
118  const ParamField *field = fields[i];
119  void *p = (void *) (((uintptr_t) params) + field->offset);
120  field->print_str(buf, size, p);
121  }
122  buf_appendf(buf, size, "_neon");
123 }
124 
125 void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
126 {
127  impl_func_name(&buf, &size, params);
128  av_assert0(size && "string buffer exhausted");
129 }
130 
131 /*********************************************************************/
132 typedef struct SwsAArch64Context {
134 
135  /* SwsOpFunc arguments. */
142 
143  /* Loop iterator variables. */
146 
147  /* Scratch registers. */
150 
151  /* CPS-related variables. */
156 
157  /* Vector registers. Two banks (low and high) are used. */
158  RasmOp vl[ 4];
159  RasmOp vh[ 4];
160  RasmOp vt[12];
161 
162  /* Read/Write data pointers and padding. */
163  RasmOp in[4];
167 
168  /* Vector register dimensions. */
169  size_t el_size;
170  size_t el_count;
171  size_t vec_size;
172  bool use_vh;
174 
175 /*********************************************************************/
/* Helper functions. */
177 
/*
 * Looping when s->use_vh is set.
 *
 * Each macro expands to an unbraced `if` followed by the corresponding
 * LOOP*() macro, so the statement or block written right after the
 * invocation becomes the guarded loop body. Because of the bare `if`, an
 * invocation must not be followed by an `else`.
 *
 * The context argument is parenthesized so that any pointer expression can
 * be passed safely.
 */
#define LOOP_VH(s, mask, idx)       if ((s)->use_vh) LOOP(mask, idx)
#define LOOP_MASK_VH(s, p, idx)     if ((s)->use_vh) LOOP_MASK(p, idx)
#define LOOP_MASK_BWD_VH(s, p, idx) if ((s)->use_vh) LOOP_MASK_BWD(p, idx)
182 
183 /* Inline rasm comments. */
184 #define CMT(comment) rasm_annotate(r, comment)
185 #define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__)
186 
187 /* Reshape all vector registers for current SwsOp. */
188 static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
189 {
190  s->vl[ 0] = a64op_make_vec( 0, el_count, el_size);
191  s->vl[ 1] = a64op_make_vec( 1, el_count, el_size);
192  s->vl[ 2] = a64op_make_vec( 2, el_count, el_size);
193  s->vl[ 3] = a64op_make_vec( 3, el_count, el_size);
194  s->vh[ 0] = a64op_make_vec( 4, el_count, el_size);
195  s->vh[ 1] = a64op_make_vec( 5, el_count, el_size);
196  s->vh[ 2] = a64op_make_vec( 6, el_count, el_size);
197  s->vh[ 3] = a64op_make_vec( 7, el_count, el_size);
198  s->vt[ 0] = a64op_make_vec(16, el_count, el_size);
199  s->vt[ 1] = a64op_make_vec(17, el_count, el_size);
200  s->vt[ 2] = a64op_make_vec(18, el_count, el_size);
201  s->vt[ 3] = a64op_make_vec(19, el_count, el_size);
202  s->vt[ 4] = a64op_make_vec(20, el_count, el_size);
203  s->vt[ 5] = a64op_make_vec(21, el_count, el_size);
204  s->vt[ 6] = a64op_make_vec(22, el_count, el_size);
205  s->vt[ 7] = a64op_make_vec(23, el_count, el_size);
206  s->vt[ 8] = a64op_make_vec(24, el_count, el_size);
207  s->vt[ 9] = a64op_make_vec(25, el_count, el_size);
208  s->vt[10] = a64op_make_vec(26, el_count, el_size);
209  s->vt[11] = a64op_make_vec(27, el_count, el_size);
210 }
211 
212 /*********************************************************************/
213 /* Function frame */
214 
/**
 * Size in bytes of the stack frame used to save n registers.
 *
 * Registers are counted in pairs (an odd count is rounded up) and every
 * pair takes 16 bytes, so the result is always a multiple of 16.
 *
 * @param n number of registers to save
 * @return frame size in bytes
 */
static unsigned clobbered_frame_size(unsigned n)
{
    const unsigned pairs = (n + 1) / 2;

    return pairs * 16;
}
219 
220 static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
221 {
222  RasmContext *r = s->rctx;
223  RasmOp sp = a64op_sp();
224  unsigned frame_size = clobbered_frame_size(n);
225  RasmOp sp_pre = a64op_pre(sp, -frame_size);
226 
227  rasm_add_comment(r, "prologue");
228  if (n == 0) {
229  /* no-op */
230  } else if (n == 1) {
231  i_str(r, regs[0], sp_pre);
232  } else {
233  i_stp(r, regs[0], regs[1], sp_pre);
234  for (unsigned i = 2; i + 1 < n; i += 2)
235  i_stp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
236  if (n & 1)
237  i_str(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
238  }
239 }
240 
241 static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
242 {
243  RasmContext *r = s->rctx;
244  RasmOp sp = a64op_sp();
245  unsigned frame_size = clobbered_frame_size(n);
246  RasmOp sp_post = a64op_post(sp, frame_size);
247 
248  rasm_add_comment(r, "epilogue");
249  if (n == 0) {
250  /* no-op */
251  } else if (n == 1) {
252  i_ldr(r, regs[0], sp_post);
253  } else {
254  if (n & 1)
255  i_ldr(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
256  for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2)
257  i_ldp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
258  i_ldp(r, regs[0], regs[1], sp_post);
259  }
260 }
261 
262 /*********************************************************************/
263 /* Callee-saved registers (r19-r28, fp, and lr). */
264 #define MAX_SAVED_REGS 12
265 
266 static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
267  RasmOp gpr)
268 {
269  const int n = a64op_gpr_n(gpr);
270  if (n >= 19 && n <= 30)
271  regs[(*count)++] = gpr;
272 }
273 
274 static unsigned clobbered_gprs(const SwsAArch64Context *s,
276  RasmOp regs[MAX_SAVED_REGS])
277 {
278  unsigned count = 0;
279  clobber_gpr(regs, &count, a64op_lr());
280  LOOP(mask, i) {
281  clobber_gpr(regs, &count, s->in[i]);
282  clobber_gpr(regs, &count, s->out[i]);
283  clobber_gpr(regs, &count, s->in_bump[i]);
284  clobber_gpr(regs, &count, s->out_bump[i]);
285  }
286  return count;
287 }
288 
290 {
291  RasmContext *r = s->rctx;
292  char func_name[128];
293  char buf[64];
294 
295  /**
296  * The process function for aarch64 works similarly to the x86 backend.
297  * The description in x86/ops_include.asm mostly holds as well here.
298  */
299 
300  snprintf(func_name, sizeof(func_name), "ff_sws_process_%04x_neon", mask);
301 
302  rasm_func_begin(r, func_name, true, false);
303 
304  /* Function prologue */
305  RasmOp saved_regs[MAX_SAVED_REGS];
306  unsigned nsaved = clobbered_gprs(s, mask, saved_regs);
307  if (nsaved)
308  asmgen_prologue(s, saved_regs, nsaved);
309 
310  /* Load values from impl. */
311  i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr op0_func = impl->cont;");
312  i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl)); CMT("SwsOpImpl *op1_impl = impl + 1;");
313 
314  /* Load values from exec. */
315  LOOP(mask, i) {
316  rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, i);
317  i_ldr(r, s->in[i], a64op_off(s->exec, offsetof_exec_in + (i * sizeof(uint8_t *))));
318  }
319  LOOP(mask, i) {
320  rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", i, i);
321  i_ldr(r, s->out[i], a64op_off(s->exec, offsetof_exec_out + (i * sizeof(uint8_t *))));
322  }
323  LOOP(mask, i) {
324  rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = exec->in_bump[%u];", i, i);
325  i_ldr(r, s->in_bump[i], a64op_off(s->exec, offsetof_exec_in_bump + (i * sizeof(ptrdiff_t))));
326  }
327  LOOP(mask, i) {
328  rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = exec->out_bump[%u];", i, i);
329  i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
330  }
331 
332  int first_row = rasm_new_label(r, NULL);
333  int next_row = rasm_new_label(r, NULL);
334  int next_block = rasm_new_label(r, NULL);
335 
336  /* Jump to first row (skips padding). */
337  i_b (r, rasm_op_label(first_row)); CMT("goto first_row;");
338 
339  /* Perform padding, preparing for next row. */
340  rasm_add_label(r, next_row); CMT("next_row:");
341  LOOP(mask, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
342  LOOP(mask, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
343 
344  /* First row (reset x). */
345  rasm_add_label(r, first_row); CMT("first_row:");
346  i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
347 
348  /* Reset impl and call first kernel. */
349  rasm_add_label(r, next_block); CMT("next_block:");
350  i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
351  i_blr(r, s->op0_func); CMT("op0_func();");
352 
353  /* Perform horizontal loop. */
354  i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
355  i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
356  i_bne(r, next_block); CMT(" goto next_block;");
357 
358  /* Perform vertical loop. */
359  i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
360  i_cmp(r, s->y, s->y_end); CMT("if (y != y_end)");
361  i_bne(r, next_row); CMT(" goto next_row;");
362 
363  /* Function epilogue */
364  if (nsaved)
365  asmgen_epilogue(s, saved_regs, nsaved);
366 
367  i_ret(r);
368 }
369 
370 /*********************************************************************/
371 /**
372  * Set node where the continuation address will be loaded and impl will
373  * be incremented. This should be done right after impl->priv has been
374  * used.
375  */
377 {
378  RasmContext *r = s->rctx;
379  s->load_cont_node = rasm_get_current_node(r);
380 }
381 
382 /*********************************************************************/
383 /* gather raw pixels from planes */
384 /* AARCH64_SWS_OP_READ_BIT */
385 /* AARCH64_SWS_OP_READ_NIBBLE */
386 /* AARCH64_SWS_OP_READ_PACKED */
387 /* AARCH64_SWS_OP_READ_PLANAR */
388 
390 {
391  RasmContext *r = s->rctx;
392  RasmOp bitmask_vec = s->vt[1];
393  RasmOp wtmp = a64op_w(s->tmp0);
394  AArch64VecViews vl[1];
395  AArch64VecViews vtmp;
396  AArch64VecViews shift_vec;
397 
398  a64op_vec_views(s->vt[0], &shift_vec);
399  a64op_vec_views(s->vl[0], &vl[0]);
400  a64op_vec_views(s->vt[2], &vtmp);
401 
402  /* Note that shift_vec has negative values, so that using it with
403  * ushl actually performs a right shift. */
404  rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
405  i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
407 
408  if (p->block_size == 16) {
409  i_ldrh(r, wtmp, a64op_post(s->in[0], 2)); CMT("uint16_t tmp = *in[0]++;");
410  i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 16 times>};");
411  i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
412  i_lsr (r, wtmp, wtmp, IMM(8)); CMT("tmp >>= 8;");
413  i_dup (r, vtmp.b8, wtmp); CMT("vtmp.lo = broadcast(tmp);");
414  i_ins (r, vl[0].de[1], vtmp.de[0]); CMT("vl[0].hi = vtmp.lo;");
415  i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
416  i_and (r, vl[0].b16, vl[0].b16, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
417  } else {
418  i_ldrb(r, wtmp, a64op_post(s->in[0], 1)); CMT("uint8_t tmp = *in[0]++;");
419  i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};");
420  i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
421  i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
422  i_and (r, vl[0].b8, vl[0].b8, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
423  }
424 }
425 
427 {
428  RasmContext *r = s->rctx;
429  RasmOp nibble_mask = v_8b(s->vt[0]);
430  AArch64VecViews vl[1];
431  AArch64VecViews vtmp;
432 
433  a64op_vec_views(s->vl[0], &vl[0]);
434  a64op_vec_views(s->vt[1], &vtmp);
435 
436  rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 <repeats 8 times>};");
437  i_movi(r, nibble_mask, IMM(0x0f));
438 
439  if (p->block_size == 8) {
440  i_ldr (r, vl[0].s, a64op_post(s->in[0], 4)); CMT("vl[0] = *in[0]++;");
441  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
442  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
443  i_zip1(r, vl[0].b8, vtmp.b8, vl[0].b8); CMT("interleave");
444  } else {
445  i_ldr (r, vl[0].d, a64op_post(s->in[0], 8)); CMT("vl[0] = *in[0]++;");
446  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
447  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
448  i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16); CMT("interleave");
449  }
450 }
451 
453 {
454  RasmContext *r = s->rctx;
455 
456  switch (p->mask) {
457  case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]), a64op_post(s->in[0], s->vec_size * 2)); break;
458  case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->in[0], s->vec_size * 3)); break;
459  case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->in[0], s->vec_size * 4)); break;
460  }
461 }
462 
464 {
465  av_assert0(p->mask != 0x0001);
466  asmgen_op_read_packed_n(s, p, s->vl);
467  if (s->use_vh)
468  asmgen_op_read_packed_n(s, p, s->vh);
469 }
470 
472 {
473  RasmContext *r = s->rctx;
474  AArch64VecViews vl[4];
475  AArch64VecViews vh[4];
476 
477  for (int i = 0; i < 4; i++) {
478  a64op_vec_views(s->vl[i], &vl[i]);
479  a64op_vec_views(s->vh[i], &vh[i]);
480  }
481 
482  LOOP_MASK(p, i) {
483  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
484  case 0x008: i_ldr(r, vl[i].d, a64op_post(s->in[i], s->vec_size * 1)); break;
485  case 0x010: i_ldr(r, vl[i].q, a64op_post(s->in[i], s->vec_size * 1)); break;
486  case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], s->vec_size * 2)); break;
487  case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], s->vec_size * 2)); break;
488  }
489  }
490 }
491 
492 /*********************************************************************/
493 /* write raw pixels to planes */
494 /* AARCH64_SWS_OP_WRITE_BIT */
495 /* AARCH64_SWS_OP_WRITE_NIBBLE */
496 /* AARCH64_SWS_OP_WRITE_PACKED */
497 /* AARCH64_SWS_OP_WRITE_PLANAR */
498 
500 {
501  RasmContext *r = s->rctx;
502  AArch64VecViews vl[1];
503  AArch64VecViews shift_vec;
504  AArch64VecViews vtmp0;
505  AArch64VecViews vtmp1;
506 
507  a64op_vec_views(s->vl[0], &vl[0]);
508  a64op_vec_views(s->vt[0], &shift_vec);
509  a64op_vec_views(s->vt[1], &vtmp0);
510  a64op_vec_views(s->vt[2], &vtmp1);
511 
512  rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
513  i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
515 
516  if (p->block_size == 8) {
517  i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
518  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
519  i_str (r, vtmp0.b, a64op_post(s->out[0], 1)); CMT("*out[0]++ = vtmp0;");
520  } else {
521  i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
522  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
523  i_ins (r, vtmp1.de[0], vl[0].de[1]); CMT("vtmp1.lo = vl[0].hi;");
524  i_addv(r, vtmp1.b, vtmp1.b8); CMT("vtmp1[0] = add_across(vtmp1);");
525  i_ins (r, vtmp0.be[1], vtmp1.be[0]); CMT("vtmp0[1] = vtmp1[0];");
526  i_str (r, vtmp0.h, a64op_post(s->out[0], 2)); CMT("*out[0]++ = vtmp0;");
527  }
528 }
529 
531 {
532  RasmContext *r = s->rctx;
533  AArch64VecViews vl[4];
534  AArch64VecViews vtmp0;
535  AArch64VecViews vtmp1;
536 
537  for (int i = 0; i < 4; i++)
538  a64op_vec_views(s->vl[i], &vl[i]);
539  a64op_vec_views(s->vt[0], &vtmp0);
540  a64op_vec_views(s->vt[1], &vtmp1);
541 
542  if (p->block_size == 8) {
543  i_shl (r, vtmp0.h4, vl[0].h4, IMM(4));
544  i_ushr(r, vtmp1.h4, vl[0].h4, IMM(8));
545  i_orr (r, vl[0].b8, vtmp0.b8, vtmp1.b8);
546  i_xtn (r, vtmp0.b8, vl[0].h8);
547  i_str (r, vtmp0.s, a64op_post(s->out[0], 4));
548  } else {
549  i_shl (r, vtmp0.h8, vl[0].h8, IMM(4));
550  i_ushr(r, vtmp1.h8, vl[0].h8, IMM(8));
551  i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16);
552  i_xtn (r, vtmp0.b8, vl[0].h8);
553  i_str (r, vtmp0.d, a64op_post(s->out[0], 8));
554  }
555 }
556 
558 {
559  RasmContext *r = s->rctx;
560 
561  switch (p->mask) {
562  case 0x0011: i_st2(r, vv_2(vx[0], vx[1]), a64op_post(s->out[0], s->vec_size * 2)); break;
563  case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->out[0], s->vec_size * 3)); break;
564  case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->out[0], s->vec_size * 4)); break;
565  }
566 }
567 
569 {
570  av_assert0(p->mask != 0x0001);
571  asmgen_op_write_packed_n(s, p, s->vl);
572  if (s->use_vh)
573  asmgen_op_write_packed_n(s, p, s->vh);
574 }
575 
577 {
578  RasmContext *r = s->rctx;
579  AArch64VecViews vl[4];
580  AArch64VecViews vh[4];
581 
582  for (int i = 0; i < 4; i++) {
583  a64op_vec_views(s->vl[i], &vl[i]);
584  a64op_vec_views(s->vh[i], &vh[i]);
585  }
586 
587  LOOP_MASK(p, i) {
588  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
589  case 0x008: i_str(r, vl[i].d, a64op_post(s->out[i], s->vec_size * 1)); break;
590  case 0x010: i_str(r, vl[i].q, a64op_post(s->out[i], s->vec_size * 1)); break;
591  case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], s->vec_size * 2)); break;
592  case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], s->vec_size * 2)); break;
593  }
594  }
595 }
596 
597 /*********************************************************************/
598 /* swap byte order (for differing endianness) */
599 /* AARCH64_SWS_OP_SWAP_BYTES */
600 
602 {
603  RasmContext *r = s->rctx;
604  AArch64VecViews vl[4];
605  AArch64VecViews vh[4];
606 
607  for (int i = 0; i < 4; i++) {
608  a64op_vec_views(s->vl[i], &vl[i]);
609  a64op_vec_views(s->vh[i], &vh[i]);
610  }
611 
612  switch (aarch64_pixel_size(p->type)) {
613  case sizeof(uint16_t):
614  LOOP_MASK (p, i) i_rev16(r, vl[i].b16, vl[i].b16);
615  LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16);
616  break;
617  case sizeof(uint32_t):
618  LOOP_MASK (p, i) i_rev32(r, vl[i].b16, vl[i].b16);
619  LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16);
620  break;
621  }
622 }
623 
624 /*********************************************************************/
625 /* rearrange channel order, or duplicate channels */
626 /* AARCH64_SWS_OP_SWIZZLE */
627 
#define SWIZZLE_TMP 0xf

/**
 * Format the display name of a swizzle operand, for use in annotations.
 *
 * @param buf destination buffer of 8 bytes
 * @param n   component index, or SWIZZLE_TMP for the temporary register
 * @param vh  non-zero for the high bank ('h'), zero for the low bank ('l')
 * @return buf, holding "vtmp<bank>" for the temporary and "v<bank>[<n>]"
 *         otherwise
 */
static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
{
    const char bank = vh ? 'h' : 'l';
    const size_t cap = sizeof(char[8]);

    if (n != SWIZZLE_TMP)
        snprintf(buf, cap, "v%c[%u]", bank, (unsigned) n);
    else
        snprintf(buf, cap, "vtmp%c", bank);

    return buf;
}
/* Same as print_swizzle_v(), formatting into an anonymous scratch buffer. */
#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh)
639 
640 static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
641 {
642  if (n == SWIZZLE_TMP)
643  return s->vt[vh];
644  return vh ? s->vh[n] : s->vl[n];
645 }
646 
647 static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
648 {
649  RasmContext *r = s->rctx;
650  RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) };
651  RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) };
652 
653  i_mov (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 0), PRINT_SWIZZLE_V(src, 0));
654  if (s->use_vh) {
655  i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 1), PRINT_SWIZZLE_V(src, 1));
656  }
657 }
658 
660 {
661  /* Compute used vectors (src and dst) */
662  uint8_t src_used[4] = { 0 };
663  bool done[4] = { true, true, true, true };
664  LOOP_MASK(p, dst) {
665  uint8_t src = MASK_GET(p->swizzle, dst);
666  src_used[src]++;
667  done[dst] = false;
668  }
669 
670  /* First perform unobstructed copies. */
671  for (bool progress = true; progress; ) {
672  progress = false;
673  for (int dst = 0; dst < 4; dst++) {
674  if (done[dst] || src_used[dst])
675  continue;
676  uint8_t src = MASK_GET(p->swizzle, dst);
677  swizzle_emit(s, dst, src);
678  src_used[src]--;
679  done[dst] = true;
680  progress = true;
681  }
682  }
683 
684  /* Then swap and rotate remaining operations. */
685  for (int dst = 0; dst < 4; dst++) {
686  if (done[dst])
687  continue;
688 
690 
691  uint8_t cur_dst = dst;
692  uint8_t src = MASK_GET(p->swizzle, cur_dst);
693  while (src != dst) {
694  swizzle_emit(s, cur_dst, src);
695  done[cur_dst] = true;
696  cur_dst = src;
697  src = MASK_GET(p->swizzle, cur_dst);
698  }
699 
700  swizzle_emit(s, cur_dst, SWIZZLE_TMP);
701  done[cur_dst] = true;
702  }
703 }
704 
705 #undef SWIZZLE_TMP
706 
707 /*********************************************************************/
708 /* split tightly packed data into components */
709 /* AARCH64_SWS_OP_UNPACK */
710 
712 {
713  RasmContext *r = s->rctx;
714  RasmOp *vl = s->vl;
715  RasmOp *vh = s->vh;
716  RasmOp *vt = s->vt;
717  RasmOp mask_gpr = a64op_w(s->tmp0);
718  uint32_t mask_val[4] = { 0 };
719  uint8_t mask_idx[4] = { 0 };
720  uint8_t cur_vt = 0;
721 
722  const int offsets[4] = {
723  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
724  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
725  MASK_GET(p->pack, 3),
726  0
727  };
728 
729  /* Generate masks. */
730  rasm_add_comment(r, "generate masks");
731  LOOP_MASK(p, i) {
732  uint32_t val = (1u << MASK_GET(p->pack, i)) - 1;
733  for (int j = 0; j < 4; j++) {
734  if (mask_val[j] == val) {
735  mask_val[i] = mask_val[j];
736  mask_idx[i] = mask_idx[j];
737  break;
738  }
739  }
740  if (!mask_val[i]) {
741  /**
742  * All-one values in movi only work up to 8-bit, and then
743  * at full 16- or 32-bit, but not for intermediate values
744  * like 10-bit. In those cases, we use mov + dup instead.
745  */
746  if (val <= 0xff || val == 0xffff) {
747  i_movi(r, vt[cur_vt], IMM(val));
748  } else {
749  i_mov (r, mask_gpr, IMM(val));
750  i_dup (r, vt[cur_vt], mask_gpr);
751  }
752  mask_val[i] = val;
753  mask_idx[i] = cur_vt++;
754  }
755  }
756 
757  /* Loop backwards to avoid clobbering component 0. */
758  LOOP_MASK_BWD (p, i) {
759  if (offsets[i]) {
760  i_ushr (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", i, offsets[i]);
761  } else if (i) {
762  i_mov16b(r, vl[i], vl[0]); CMTF("vl[%u] = vl[0];", i);
763  }
764  }
765  LOOP_MASK_BWD_VH(s, p, i) {
766  if (offsets[i]) {
767  i_ushr (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", i, offsets[i]);
768  } else if (i) {
769  i_mov16b(r, vh[i], vh[0]); CMTF("vh[%u] = vh[0];", i);
770  }
771  }
772 
773  /* Apply masks. */
774  reshape_all_vectors(s, 16, 1);
775  LOOP_MASK_BWD (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); }
776  LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); }
777 }
778 
779 /*********************************************************************/
780 /* compress components into tightly packed data */
781 /* AARCH64_SWS_OP_PACK */
782 
784 {
785  RasmContext *r = s->rctx;
786  RasmOp *vl = s->vl;
787  RasmOp *vh = s->vh;
788 
789  const int offsets[4] = {
790  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
791  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
792  MASK_GET(p->pack, 3),
793  0
794  };
795  uint16_t offset_mask = 0;
796  LOOP_MASK(p, i) {
797  if (offsets[i])
798  MASK_SET(offset_mask, i, 1);
799  }
800 
801  /* Perform left shift. */
802  LOOP (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); }
803  LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); }
804 
805  /* Combine components. */
806  reshape_all_vectors(s, 16, 1);
807  LOOP_MASK (p, i) {
808  if (i != 0) {
809  i_orr (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i);
810  if (s->use_vh) {
811  i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i);
812  }
813  }
814  }
815 }
816 
817 /*********************************************************************/
818 /* logical left shift of raw pixel values */
819 /* AARCH64_SWS_OP_LSHIFT */
820 
822 {
823  RasmContext *r = s->rctx;
824  RasmOp *vl = s->vl;
825  RasmOp *vh = s->vh;
826 
827  LOOP_MASK (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] <<= %u;", i, p->shift); }
828  LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] <<= %u;", i, p->shift); }
829 }
830 
831 /*********************************************************************/
832 /* right shift of raw pixel values */
833 /* AARCH64_SWS_OP_RSHIFT */
834 
836 {
837  RasmContext *r = s->rctx;
838  RasmOp *vl = s->vl;
839  RasmOp *vh = s->vh;
840 
841  LOOP_MASK (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] >>= %u;", i, p->shift); }
842  LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] >>= %u;", i, p->shift); }
843 }
844 
845 /*********************************************************************/
846 /* clear pixel values */
847 /* AARCH64_SWS_OP_CLEAR */
848 
850 {
851  RasmContext *r = s->rctx;
852  RasmOp *vl = s->vl;
853  RasmOp *vh = s->vh;
854  RasmOp clear_vec = s->vt[0];
855 
856  /**
857  * TODO
858  * - pack elements in impl->priv and perform smaller loads
859  * - if only 1 element and not vh, load directly with ld1r
860  */
861 
862  i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 clear_vec = impl->priv.v128;");
864 
865  LOOP_MASK (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i)); CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
866  LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i)); CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
867 }
868 
869 /*********************************************************************/
870 /* convert (cast) between formats */
871 /* AARCH64_SWS_OP_CONVERT */
872 
874 {
875  RasmContext *r = s->rctx;
876  AArch64VecViews vl[4];
877  AArch64VecViews vh[4];
878 
879  /**
880  * Since each instruction in the convert operation needs specific
881  * element types, it is simpler to use arrangement specifiers for
882  * each operand instead of reshaping all vectors.
883  */
884 
885  for (int i = 0; i < 4; i++) {
886  a64op_vec_views(s->vl[i], &vl[i]);
887  a64op_vec_views(s->vh[i], &vh[i]);
888  }
889 
890  size_t src_el_size = s->el_size;
891  size_t dst_el_size = aarch64_pixel_size(p->to_type);
892 
893  /**
894  * This function assumes block_size is either 8 or 16, and that
895  * we're always using the most amount of vector registers possible.
896  * Therefore, u32 always uses the high vector bank.
897  */
898  if (p->type == AARCH64_PIXEL_F32) {
899  rasm_add_comment(r, "f32 -> u32");
900  LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4);
901  LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4);
902  }
903 
904  if (p->block_size == 8) {
905  if (src_el_size == 1 && dst_el_size > src_el_size) {
906  rasm_add_comment(r, "u8 -> u16");
907  LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
908  src_el_size = 2;
909  } else if (src_el_size == 4 && dst_el_size < src_el_size) {
910  rasm_add_comment(r, "u32 -> u16");
911  LOOP_MASK(p, i) i_xtn (r, vl[i].h4, vl[i].s4);
912  LOOP_MASK(p, i) i_xtn (r, vh[i].h4, vh[i].s4);
913  LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
914  src_el_size = 2;
915  }
916  if (src_el_size == 2 && dst_el_size == 4) {
917  rasm_add_comment(r, "u16 -> u32");
918  LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4, vl[i].h8);
919  LOOP_MASK(p, i) i_uxtl (r, vl[i].s4, vl[i].h4);
920  src_el_size = 4;
921  } else if (src_el_size == 2 && dst_el_size == 1) {
922  rasm_add_comment(r, "u16 -> u8");
923  LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
924  src_el_size = 1;
925  }
926  } else /* if (p->block_size == 16) */ {
927  if (src_el_size == 1 && dst_el_size == 2) {
928  rasm_add_comment(r, "u8 -> u16");
929  LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8, vl[i].b16);
930  LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
931  } else if (src_el_size == 2 && dst_el_size == 1) {
932  rasm_add_comment(r, "u16 -> u8");
933  LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
934  LOOP_MASK(p, i) i_xtn (r, vh[i].b8, vh[i].h8);
935  LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
936  }
937  }
938 
939  /* See comment above for high vector bank usage for u32. */
940  if (p->to_type == AARCH64_PIXEL_F32) {
941  rasm_add_comment(r, "u32 -> f32");
942  LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4);
943  LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4);
944  }
945 }
946 
947 /*********************************************************************/
948 /* expand integers to the full range */
949 /* AARCH64_SWS_OP_EXPAND */
950 
952 {
953  RasmContext *r = s->rctx;
954  RasmOp *vl = s->vl;
955  RasmOp *vh = s->vh;
956 
957  size_t src_el_size = s->el_size;
958  size_t dst_el_size = aarch64_pixel_size(p->to_type);
959  size_t dst_total_size = p->block_size * dst_el_size;
960  size_t dst_vec_size = FFMIN(dst_total_size, 16);
961 
962  if (!s->use_vh)
963  s->use_vh = (dst_vec_size != dst_total_size);
964 
965  if (src_el_size == 1) {
966  rasm_add_comment(r, "u8 -> u16");
967  reshape_all_vectors(s, 16, 1);
968  LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
969  LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
970  }
971  if (dst_el_size == 4) {
972  rasm_add_comment(r, "u16 -> u32");
973  reshape_all_vectors(s, 8, 2);
974  LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
975  LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
976  }
977 }
978 
979 /*********************************************************************/
980 /* numeric minimum */
981 /* AARCH64_SWS_OP_MIN */
982 
984 {
985  RasmContext *r = s->rctx;
986  RasmOp *vl = s->vl;
987  RasmOp *vh = s->vh;
988  RasmOp *vt = s->vt;
989  RasmOp min_vec = s->vt[4];
990 
991  i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 min_vec = impl->priv.v128;");
993  LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i)); CMTF("v128 vmin%u = min_vec[%u];", i, i); }
994 
995  if (p->type == AARCH64_PIXEL_F32) {
996  LOOP_MASK (p, i) { i_fmin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
997  LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
998  } else {
999  LOOP_MASK (p, i) { i_umin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
1000  LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
1001  }
1002 }
1003 
1004 /*********************************************************************/
1005 /* numeric maximum */
1006 /* AARCH64_SWS_OP_MAX */
1007 
1009 {
1010  RasmContext *r = s->rctx;
1011  RasmOp *vl = s->vl;
1012  RasmOp *vh = s->vh;
1013  RasmOp *vt = s->vt;
1014  RasmOp max_vec = s->vt[4];
1015 
1016  i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 max_vec = impl->priv.v128;");
1018  LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i)); CMTF("v128 vmax%u = max_vec[%u];", i, i); }
1019 
1020  if (p->type == AARCH64_PIXEL_F32) {
1021  LOOP_MASK (p, i) { i_fmax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
1022  LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
1023  } else {
1024  LOOP_MASK (p, i) { i_umax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
1025  LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
1026  }
1027 }
1028 
1029 /*********************************************************************/
1030 /* multiplication by scalar */
1031 /* AARCH64_SWS_OP_SCALE */
1032 
1034 {
1035  RasmContext *r = s->rctx;
1036  RasmOp *vl = s->vl;
1037  RasmOp *vh = s->vh;
1038  RasmOp priv_ptr = s->tmp0;
1039  RasmOp scale_vec = s->vt[0];
1040 
1041  i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv)); CMT("v128 *scale_vec_ptr = &impl->priv;");
1043  i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr)); CMT("v128 scale_vec = broadcast(*scale_vec_ptr);");
1044 
1045  if (p->type == AARCH64_PIXEL_F32) {
1046  LOOP_MASK (p, i) { i_fmul(r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
1047  LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
1048  } else {
1049  LOOP_MASK (p, i) { i_mul (r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
1050  LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
1051  }
1052 }
1053 
1054 /*********************************************************************/
1055 /* generalized linear affine transform */
1056 /* AARCH64_SWS_OP_LINEAR */
1057 
1058 /**
1059  * Performs one pass of the linear transform over a single vector bank
1060  * (low or high).
1061  */
1063  RasmOp *vt, RasmOp *vc,
1064  int save_mask, bool vh_pass)
1065 {
1066  RasmContext *r = s->rctx;
1067  /**
1068  * The intermediate registers for fmul+fadd (for when SWS_BITEXACT
1069  * is set) start from temp vector 4.
1070  */
1071  RasmOp *vtmp = &vt[4];
1072  RasmOp *vx = vh_pass ? s->vh : s->vl;
1073  char cvh = vh_pass ? 'h' : 'l';
1074 
1075  if (vh_pass && !s->use_vh)
1076  return;
1077 
1078  /**
1079  * Save rows that need to be used as input after they have been already
1080  * written to.
1081  */
1082  RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] };
1083  if (save_mask) {
1084  for (int i = 0; i < 4; i++) {
1085  if (MASK_GET(save_mask, i)) {
1086  src_vx[i] = vt[i];
1087  i_mov16b(r, vt[i], vx[i]); CMTF("vsrc[%u] = v%c[%u];", i, cvh, i);
1088  }
1089  }
1090  }
1091 
1092  /**
1093  * The non-zero coefficients have been packed in aarch64_setup_linear()
1094  * in sequential order into the individual lanes of the coefficient
1095  * vector registers. We must follow the same order of execution here.
1096  */
1097  int i_coeff = 0;
1098  LOOP_MASK(p, i) {
1099  bool first = true;
1100  RasmNode *pre_mul = rasm_get_current_node(r);
1101  for (int j = 0; j < 5; j++) {
1102  if (!LINEAR_MASK_GET(p->linear.mask, i, j))
1103  continue;
1104  bool is_offset = linear_index_is_offset(j);
1105  int src_j = linear_index_to_vx(j);
1106  RasmOp vsrc = src_vx[src_j];
1107  uint8_t vc_i = i_coeff / 4;
1108  uint8_t vc_j = i_coeff & 3;
1109  RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j);
1110  i_coeff++;
1111  if (first && is_offset) {
1112  i_dup (r, vx[i], vcoeff); CMTF("v%c[%u] = broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j);
1113  } else if (first && !is_offset) {
1114  if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) {
1115  i_mov16b(r, vx[i], vsrc); CMTF("v%c[%u] = vsrc[%u];", cvh, i, src_j);
1116  } else {
1117  i_fmul (r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] = vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1118  }
1119  } else if (!p->linear.fmla) {
1120  /**
1121  * Split the multiply-accumulate into fmul+fadd. All
1122  * multiplications are performed first into temporary
1123  * registers, and only then added to the destination,
1124  * to reduce the dependency chain.
1125  * There is no need to perform multiplications by 1.
1126  */
1127  if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) {
1128  pre_mul = rasm_set_current_node(r, pre_mul);
1129  i_fmul(r, vtmp[vc_j], vsrc, vcoeff); CMTF("vtmp[%u] = vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j);
1130  pre_mul = rasm_set_current_node(r, pre_mul);
1131  i_fadd(r, vx[i], vx[i], vtmp[vc_j]); CMTF("v%c[%u] += vtmp[%u];", cvh, i, vc_j);
1132  } else {
1133  i_fadd(r, vx[i], vx[i], vsrc); CMTF("v%c[%u] += vsrc[%u];", cvh, i, vc_j);
1134  }
1135  } else {
1136  /**
1137  * Most modern aarch64 cores have a fastpath for sequences
1138  * of fmla instructions. This means that even if the coefficient
1139  * is 1, it is still faster to use fmla by 1 instead of fadd.
1140  */
1141  i_fmla(r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] += vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1142  }
1143  first = false;
1144  }
1145  }
1146 }
1147 
1149 {
1150  RasmContext *r = s->rctx;
1151  RasmOp *vt = s->vt;
1152  RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */
1153  RasmOp ptr = s->tmp0;
1154  RasmOp coeff_veclist;
1155 
1156  /* Preload coefficients from impl->priv. */
1157  const int num_vregs = linear_num_vregs(p);
1158  av_assert0(num_vregs <= 4);
1159  switch (num_vregs) {
1160  case 1: coeff_veclist = vv_1(vc[0]); break;
1161  case 2: coeff_veclist = vv_2(vc[0], vc[1]); break;
1162  case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]); break;
1163  case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
1164  }
1165  i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 *vcoeff_ptr = impl->priv.ptr;");
1167  i_ld1(r, coeff_veclist, a64op_base(ptr)); CMT("coeff_veclist = *vcoeff_ptr;");
1168 
1169  /* Compute mask for rows that must be saved before being overwritten. */
1170  uint16_t save_mask = 0;
1171  bool overwritten[4] = { false, false, false, false };
1172  LOOP_MASK(p, i) {
1173  for (int j = 0; j < 5; j++) {
1174  if (!LINEAR_MASK_GET(p->linear.mask, i, j))
1175  continue;
1176  bool is_offset = linear_index_is_offset(j);
1177  int src_j = linear_index_to_vx(j);
1178  if (!is_offset && overwritten[src_j])
1179  MASK_SET(save_mask, j - 1, 1);
1180  overwritten[i] = true;
1181  }
1182  }
1183 
1184  /* Perform linear passes for low and high vector banks. */
1185  linear_pass(s, p, vt, vc, save_mask, false);
1186  linear_pass(s, p, vt, vc, save_mask, true);
1187 }
1188 
1189 /*********************************************************************/
1190 /* add dithering noise */
1191 /* AARCH64_SWS_OP_DITHER */
1192 
1194 {
1195  RasmContext *r = s->rctx;
1196  RasmOp *vl = s->vl;
1197  RasmOp *vh = s->vh;
1198  RasmOp ptr = s->tmp0;
1199  RasmOp tmp1 = s->tmp1;
1200  RasmOp wtmp1 = a64op_w(tmp1);
1201  RasmOp dither_vl = s->vt[0];
1202  RasmOp dither_vh = s->vt[1];
1203  RasmOp bx64 = a64op_x(s->bx);
1204  RasmOp y64 = a64op_x(s->y);
1205 
1206  /**
1207  * For a description of the matrix buffer layout, read the comments
1208  * in aarch64_setup_dither() in aarch64/ops.c.
1209  */
1210 
1211  /**
1212  * Sort components by y_offset value so that we can start dithering
1213  * with the smallest value, and increment the pointer upwards for
1214  * each new offset. The dither matrix is over-allocated and may be
1215  * over-read at the top, but it cannot be over-read before the start
1216  * of the buffer. Since we only mask the y offset once, this would
1217  * be an issue if we tried to subtract a value larger than the
1218  * initial y_offset.
1219  */
1220  int sorted[4];
1221  int n_comps = 0;
1222  /* Very cheap bucket sort. */
1223  int max_offset = 0;
1224  LOOP_MASK(p, i)
1225  max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i));
1226  for (int y_off = 0; y_off <= max_offset; y_off++) {
1227  LOOP_MASK(p, i) {
1228  if (MASK_GET(p->dither.y_offset, i) == y_off)
1229  sorted[n_comps++] = i;
1230  }
1231  }
1232 
1233  i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("void *ptr = impl->priv.ptr;");
1235 
1236  /**
1237  * We use ubfiz to mask and shift left in one single instruction:
1238  * ubfiz <Wd>, <Wn>, #<lsb>, #<width>
1239  * Wd = (Wn & ((1 << width) - 1)) << lsb;
1240  *
1241  * Given:
1242  * block_size = 8, log2(block_size) = 3
1243  * dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111
1244  * sizeof(float) = 4, log2(sizeof(float)) = 2
1245  *
1246  * Suppose we have bx = 0bvvvv. To get x, we left shift by
1247  * log2(block_size) and end up with 0bvvvv000. Then we mask against
1248  * dither_mask, and end up with 0bv000. Finally we multiply by
1249  * sizeof(float), which is the same as shifting left by
1250  * log2(sizeof(float)). The result is 0bv00000.
1251  *
1252  * Therefore:
1253  * width = log2(dither_size) - log2(block_size)
1254  * lsb = log2(block_size) + log2(sizeof(float))
1255  */
1256  const int block_size_log2 = (p->block_size == 16) ? 4 : 3;
1257  const int dither_size_log2 = p->dither.size_log2;
1258  const int sizeof_float_log2 = 2;
1259  if (dither_size_log2 != block_size_log2) {
1260  RasmOp lsb = IMM(block_size_log2 + sizeof_float_log2);
1261  RasmOp width = IMM(dither_size_log2 - block_size_log2);
1262  i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);");
1263  i_add (r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
1264  }
1265 
1266  int last_y_off = -1;
1267  int prev_i = 0;
1268  for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) {
1269  int i = sorted[sorted_i];
1270  uint8_t y_off = MASK_GET(p->dither.y_offset, i);
1271  bool do_load = (y_off != last_y_off);
1272 
1273  if (last_y_off < 0) {
1274  /* On the first run, calculate pointer inside dither_matrix. */
1275  RasmOp lsb = IMM(dither_size_log2 + sizeof_float_log2);
1276  RasmOp width = IMM(dither_size_log2);
1277  /**
1278  * The ubfiz instruction for the y offset performs masking
1279  * by the dither matrix size and shifts by the stride.
1280  */
1281  if (y_off == 0) {
1282  i_ubfiz(r, tmp1, y64, lsb, width); CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);");
1283  } else {
1284  i_add (r, wtmp1, s->y, IMM(y_off)); CMTF("tmp1 = y + y_off[%u];", i);
1285  i_ubfiz(r, tmp1, tmp1, lsb, width); CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);");
1286  }
1287  i_add(r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
1288  } else if (do_load) {
1289  /**
1290  * On subsequent runs, just increment the pointer.
1291  * The matrix is over-allocated, so we don't risk
1292  * overreading.
1293  */
1294  int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float);
1295  i_add(r, ptr, ptr, IMM(delta)); CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i);
1296  }
1297 
1298  if (do_load) {
1299  RasmOp dither_vlq = v_q(dither_vl);
1300  RasmOp dither_vhq = v_q(dither_vh);
1301  i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;");
1302  }
1303 
1304  i_fadd (r, vl[i], vl[i], dither_vl); CMTF("vl[%u] += vditherl;", i);
1305  if (s->use_vh) {
1306  i_fadd(r, vh[i], vh[i], dither_vh); CMTF("vh[%u] += vditherh;", i);
1307  }
1308 
1309  last_y_off = y_off;
1310  prev_i = i;
1311  }
1312 }
1313 
1314 /*********************************************************************/
1316 {
 /*
  * Generates one exported kernel function for the operation described by p:
  * begins the function, sets up the vector dimensions from p->type and
  * p->block_size, dispatches to the per-operation generator and finally
  * emits either a return (write operations) or a branch to the
  * continuation.
  *
  * NOTE(review): the function header and several lines of this body (the
  * case labels of the first switch, the statement after the "Common start"
  * comment and part of the dispatch table) are not present in this listing;
  * confirm against the original file before editing.
  */
1317  RasmContext *r = s->rctx;
1318 
1319  bool is_read = false;
1320  bool is_write = false;
 /* Classify the operation; presumably the missing labels are the read and write ops. */
1321  switch (p->op) {
1326  is_read = true;
1327  break;
1332  is_write = true;
1333  break;
1334  default:
1335  break;
1336  }
1337 
1338  char func_name[128];
1339  aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
 /* The last argument is true for every operation that is not a read. */
1340  rasm_func_begin(r, func_name, true, !is_read);
1341 
1342  /**
1343  * Set up vector register dimensions and reshape all vectors
1344  * accordingly.
1345  */
1346  size_t el_size = aarch64_pixel_size(p->type);
1347  size_t total_size = p->block_size * el_size;
1348 
 /* A block larger than 16 bytes is split across the low and high banks. */
1349  s->vec_size = FFMIN(total_size, 16);
1350  s->use_vh = (s->vec_size != total_size);
1351 
1352  s->el_size = el_size;
1353  s->el_count = s->vec_size / el_size;
1354  reshape_all_vectors(s, s->el_count, el_size);
1355 
1356  /* Common start for continuation-passing style (CPS) functions. */
 /* NOTE(review): presumably s->load_cont_node is recorded here; the line is missing from this listing. */
1358 
 /* Dispatch to the generator of this operation. */
1359  switch (p->op) {
1370  case AARCH64_SWS_OP_UNPACK: asmgen_op_unpack(s, p); break;
1371  case AARCH64_SWS_OP_PACK: asmgen_op_pack(s, p); break;
1372  case AARCH64_SWS_OP_LSHIFT: asmgen_op_lshift(s, p); break;
1373  case AARCH64_SWS_OP_RSHIFT: asmgen_op_rshift(s, p); break;
1374  case AARCH64_SWS_OP_CLEAR: asmgen_op_clear(s, p); break;
1376  case AARCH64_SWS_OP_EXPAND: asmgen_op_expand(s, p); break;
1377  case AARCH64_SWS_OP_MIN: asmgen_op_min(s, p); break;
1378  case AARCH64_SWS_OP_MAX: asmgen_op_max(s, p); break;
1379  case AARCH64_SWS_OP_SCALE: asmgen_op_scale(s, p); break;
1380  case AARCH64_SWS_OP_LINEAR: asmgen_op_linear(s, p); break;
1381  case AARCH64_SWS_OP_DITHER: asmgen_op_dither(s, p); break;
1382  /* TODO implement AARCH64_SWS_OP_SHUFFLE */
1383  default:
1384  break;
1385  }
1386 
1387  if (is_write) {
1388  /* Write functions return directly. */
1389  i_ret(r);
1390  } else {
1391  /* Load continuation address and increment impl pointer. */
 /* The load is inserted at s->load_cont_node, then the insertion point is restored. */
1392  RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
1393  RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
1394  i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
1395  rasm_set_current_node(r, node);
1396  /* Common end for remaining CPS functions. */
1397  i_br (r, s->cont); CMT("jump to cont");
1398  }
1399 }
1400 
1401 /*********************************************************************/
1402 static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params,
1403  const SwsAArch64OpImplParams *prev, const char *p_str)
1404 {
1405  int first_diff = 0;
1406  int prev_levels = 0;
1407  int levels = 0;
1408 
1409  /* Compute number of current levels. */
1410  if (params) {
1411  const ParamField **fields = op_fields[params->op];
1412  while (fields[levels])
1413  levels++;
1414  }
1415 
1416  /* Compute number of previous levels. */
1417  if (prev) {
1418  const ParamField **prev_fields = op_fields[prev->op];
1419  while (prev_fields[prev_levels])
1420  prev_levels++;
1421  }
1422 
1423  /* Walk up and check the conditions that match. */
1424  if (params && prev) {
1425  const ParamField **fields = op_fields[params->op];
1426  first_diff = -1;
1427  for (int i = 0; fields[i]; i++) {
1428  const ParamField *field = fields[i];
1429  if (first_diff < 0) {
1430  int diff = field->cmp_val((void *) (((uintptr_t) params) + field->offset),
1431  (void *) (((uintptr_t) prev) + field->offset));
1432  if (diff)
1433  first_diff = i;
1434  }
1435  }
1436  }
1437 
1438  /* Walk back closing conditions. */
1439  if (prev) {
1440  for (int i = prev_levels - 1; i > first_diff; i--) {
1441  buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), "");
1442  buf_appendf(&buf, &size, "%*s}\n", 4 * i, "");
1443  }
1444  }
1445 
1446  /* Walk up adding conditions to return current function. */
1447  if (params) {
1448  const ParamField **fields = op_fields[params->op];
1449  for (int i = first_diff; i < levels; i++) {
1450  const ParamField *field = fields[i];
1451  void *p = (void *) (((uintptr_t) params) + field->offset);
1452  buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name);
1453  field->print_val(&buf, &size, p);
1454  buf_appendf(&buf, &size, ")");
1455  if (i == (levels - 1)) {
1456  buf_appendf(&buf, &size, " return ");
1457  impl_func_name(&buf, &size, params);
1458  buf_appendf(&buf, &size, ";\n");
1459  } else {
1460  buf_appendf(&buf, &size, " {\n");
1461  }
1462  }
1463  }
1464 
1465  av_assert0(size && "string buffer exhausted");
1466 }
1467 
1468 static int lookup_gen(void)
1469 {
1470  char buf[1024];
1471 
1472  /**
1473  * The lookup function matches the SwsAArch64OpImplParams from
1474  * ops_entries.c to the exported functions generated by asmgen_op().
1475  * Each call to aarch64_op_impl_lookup_str() generates a code
1476  * fragment to uniquely detect the current function, opening and/or
1477  * closing conditions depending on the parameters of the previous
1478  * function.
1479  */
1480 
1481  /* External function declarations. */
1482  printf("#include \"libswscale/aarch64/ops_lookup.h\"\n");
1483  printf("\n");
1484  for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
1485  aarch64_op_impl_func_name(buf, sizeof(buf), p);
1486  printf("extern void %s(void);\n", buf);
1487  }
1488  printf("\n");
1489 
1490  /* Lookup function. */
1491  printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n");
1492  printf("{\n");
1493  const SwsAArch64OpImplParams *prev = NULL;
1494  for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
1495  aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->");
1496  printf("%s", buf);
1497  prev = p;
1498  }
1499  aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->");
1500  printf("%s", buf);
1501  printf(" return NULL;\n");
1502  printf("}\n");
1503 
1504  return 0;
1505 }
1506 
1507 /*********************************************************************/
1508 
1509 /* Generate all functions described by ops_entries.c */
1510 static int asmgen(void)
1511 {
1512  RasmContext *rctx = rasm_alloc();
1513  if (!rctx)
1514  return AVERROR(ENOMEM);
1515 
1516  SwsAArch64Context s = { .rctx = rctx };
1517  int ret;
1518 
1519  /**
1520  * The entry point of the SwsOpFunc is the `process` function. The
1521  * first kernel function is called from `process`, and subsequent
1522  * kernel functions are chained by directly branching to the next
1523  * operation, using a continuation-passing style design. The last
1524  * operation must be a write operation, which returns from the call
1525  * to the `process` function.
1526  *
1527  * The GPRs used by the entire call-chain are listed below.
1528  *
1529  * Function arguments are passed in r0-r5. After the parameters
1530  * from `exec` have been read, r0 is reused to branch to the
1531  * continuation functions. After the original parameters from
1532  * `impl` have been computed, r1 is reused as the `impl` pointer
1533  * for each operation.
1534  *
1535  * Loop iterators are r6 for `bx` and r3 for `y`, reused from
1536  * `y_start`, which doesn't need to be preserved.
1537  *
1538  * The intra-procedure-call temporary registers (r16 and r17) are
1539  * used as scratch registers. They may be used by call veneers and
1540  * PLT code inserted by the linker, so we cannot expect them to
1541  * persist across branches between functions.
1542  *
1543  * The Platform Register (r18) is not used.
1544  *
1545  * The read/write data pointers and padding values first use up the
1546  * remaining free caller-saved registers, and only then are the
1547  * caller-saved registers (r19-r28) used.
1548  *
1549  * The Link Register (r30) is used when calling the first kernel,
1550  * so it must be saved.
1551  */
1552 
1553  /* SwsOpFunc arguments. */
1554  s.exec = a64op_gpx(0); // const SwsOpExec *exec
1555  s.impl = a64op_gpx(1); // const void *priv
1556  s.bx_start = a64op_gpw(2); // int bx_start
1557  s.y_start = a64op_gpw(3); // int y_start
1558  s.bx_end = a64op_gpw(4); // int bx_end
1559  s.y_end = a64op_gpw(5); // int y_end
1560 
1561  /* Loop iterator variables. */
1562  s.bx = a64op_gpw(6);
1563  s.y = s.y_start; /* Reused from SwsOpFunc argument. */
1564 
1565  /* Scratch registers. */
1566  s.tmp0 = a64op_gpx(16); /* IP0 */
1567  s.tmp1 = a64op_gpx(17); /* IP1 */
1568 
1569  /* CPS-related variables. */
1570  s.op0_func = a64op_gpx(7);
1571  s.op1_impl = a64op_gpx(8);
1572  s.cont = s.exec; /* Reused from SwsOpFunc argument. */
1573 
1574  /* Read/Write data pointers and padding. */
1575  s.in [0] = a64op_gpx(9);
1576  s.out [0] = a64op_gpx(10);
1577  s.in_bump [0] = a64op_gpx(11);
1578  s.out_bump[0] = a64op_gpx(12);
1579  s.in [1] = a64op_gpx(13);
1580  s.out [1] = a64op_gpx(14);
1581  s.in_bump [1] = a64op_gpx(15);
1582  s.out_bump[1] = a64op_gpx(19);
1583  s.in [2] = a64op_gpx(20);
1584  s.out [2] = a64op_gpx(21);
1585  s.in_bump [2] = a64op_gpx(22);
1586  s.out_bump[2] = a64op_gpx(23);
1587  s.in [3] = a64op_gpx(24);
1588  s.out [3] = a64op_gpx(25);
1589  s.in_bump [3] = a64op_gpx(26);
1590  s.out_bump[3] = a64op_gpx(27);
1591 
1592  /* Generate all process functions using rasm. */
1593  asmgen_process(&s, 0x0001);
1594  asmgen_process(&s, 0x0011);
1595  asmgen_process(&s, 0x0111);
1596  asmgen_process(&s, 0x1111);
1597 
1598  /* Generate all functions from ops_entries.c using rasm. */
1599  const SwsAArch64OpImplParams *params = impl_params;
1600  while (params->op) {
1601  asmgen_op_cps(&s, params++);
1602  if (rctx->error) {
1603  ret = rctx->error;
1604  goto error;
1605  }
1606  }
1607 
1608  /* Print all rasm functions to stdout. */
1609  printf("#include \"libavutil/aarch64/asm.S\"\n");
1610  printf("\n");
1611  ret = rasm_print(s.rctx, stdout);
1612 
1613 error:
1614  rasm_free(&s.rctx);
1615  return ret;
1616 }
1617 
1618 /*********************************************************************/
/**
 * Entry point of the generator.
 *
 * Exactly one of the two modes must be selected on the command line:
 * "-ops" prints the generated assembly and "-lookup" prints the lookup
 * function. Any other argument is ignored.
 *
 * @return the result of the selected generator, or -1 when the mode
 *         selection is invalid
 */
int main(int argc, char *argv[])
{
    bool want_lookup = false;
    bool want_ops    = false;

#ifdef _WIN32
    _setmode(_fileno(stdout), _O_BINARY);
#endif

    for (int arg = 1; arg < argc; arg++) {
        const char *opt = argv[arg];
        if (strcmp(opt, "-ops") == 0)
            want_ops = true;
        else if (strcmp(opt, "-lookup") == 0)
            want_lookup = true;
    }

    /* Both flags set or both unset: the mode is ambiguous. */
    if (want_lookup == want_ops) {
        fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n");
        return -1;
    }

    if (want_lookup)
        return lookup_gen();
    return asmgen();
}
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
asmgen_op_write_planar
static void asmgen_op_write_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:576
AARCH64_SWS_OP_MIN
@ AARCH64_SWS_OP_MIN
Definition: ops_impl.h:57
AArch64VecViews::h8
RasmOp h8
Definition: rasm.h:466
linear_index_to_vx
static int linear_index_to_vx(int idx)
Definition: ops_impl.h:155
rasm_print.c
FF_DYNARRAY_ADD
#define FF_DYNARRAY_ADD(av_size_max, av_elt_size, av_array, av_size, av_success, av_failure)
Add an element to a dynamic array.
Definition: dynarray.h:45
LINEAR_MASK_GET
#define LINEAR_MASK_GET(mask, idx, jdx)
Definition: ops_impl.h:122
SwsAArch64Context::vh
RasmOp vh[4]
Definition: ops_asmgen.c:159
rasm_alloc
RasmContext * rasm_alloc(void)
Definition: rasm.c:32
r
const char * r
Definition: vf_curves.c:127
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
printf
__device__ int printf(const char *,...)
main
int main(int argc, char *argv[])
Definition: ops_asmgen.c:1619
LOOP_MASK_BWD_VH
#define LOOP_MASK_BWD_VH(s, p, idx)
Definition: ops_asmgen.c:181
i_ld1
#define i_ld1(rctx, op0, op1)
Definition: rasm.h:557
LOOP_MASK_BWD
#define LOOP_MASK_BWD(p, idx)
Definition: ops_impl.h:120
ParamField
The following structure is used to describe one field from SwsAArch64OpImplParams.
Definition: ops_impl.c:186
reshape_all_vectors
static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
Definition: ops_asmgen.c:188
a64op_base
static RasmOp a64op_base(RasmOp op)
Definition: rasm.h:498
AArch64VecViews::b16
RasmOp b16
Definition: rasm.h:464
i_zip1
#define i_zip1(rctx, op0, op1, op2)
Definition: rasm.h:596
AArch64VecViews::d
RasmOp d
Definition: rasm.h:460
i_ld4
#define i_ld4(rctx, op0, op1)
Definition: rasm.h:561
i_mul
#define i_mul(rctx, op0, op1, op2)
Definition: rasm.h:569
a64op_gpx
static RasmOp a64op_gpx(uint8_t n)
Definition: rasm.h:354
av_dynarray2_add
static void * av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size, const uint8_t *elem_data)
Definition: ops_asmgen.c:65
asmgen_op_read_packed
static void asmgen_op_read_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:463
a64op_w
static RasmOp a64op_w(RasmOp op)
Definition: rasm.h:359
RasmContext::error
int error
Definition: rasm.h:191
clobber_gpr
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count, RasmOp gpr)
Definition: ops_asmgen.c:266
mask
int mask
Definition: mediacodecdec_common.c:154
SwsAArch64Context::out_bump
RasmOp out_bump[4]
Definition: ops_asmgen.c:166
rasm_free
void rasm_free(RasmContext **prctx)
Definition: rasm.c:37
i_blr
#define i_blr(rctx, op0)
Definition: rasm.h:545
rasm_set_current_node
RasmNode * rasm_set_current_node(RasmContext *rctx, RasmNode *node)
Definition: rasm.c:199
i_st4
#define i_st4(rctx, op0, op1)
Definition: rasm.h:578
u
#define u(width, name, range_min, range_max)
Definition: cbs_apv.c:68
asmgen_op_max
static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1008
AArch64VecViews
This helper structure is used to mimic the assembler syntax for vector register modifiers.
Definition: rasm.h:455
SwsAArch64Context::rctx
RasmContext * rctx
Definition: ops_asmgen.c:133
rasm_get_current_node
RasmNode * rasm_get_current_node(RasmContext *rctx)
Definition: rasm.c:194
asmgen_op_write_bit
static void asmgen_op_write_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:499
a64op_gpw
static RasmOp a64op_gpw(uint8_t n)
Definition: rasm.h:353
i_ld3
#define i_ld3(rctx, op0, op1)
Definition: rasm.h:560
vv_2
static RasmOp vv_2(RasmOp op0, RasmOp op1)
Definition: rasm.h:446
vv_3
static RasmOp vv_3(RasmOp op0, RasmOp op1, RasmOp op2)
Definition: rasm.h:447
CMTF
#define CMTF(fmt,...)
Definition: ops_asmgen.c:185
PRINT_SWIZZLE_V
#define PRINT_SWIZZLE_V(n, vh)
Definition: ops_asmgen.c:638
AARCH64_SWS_OP_SWIZZLE
@ AARCH64_SWS_OP_SWIZZLE
Definition: ops_impl.h:49
i_dup
#define i_dup(rctx, op0, op1)
Definition: rasm.h:549
asmgen_op_dither
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1193
SwsAArch64Context::y
RasmOp y
Definition: ops_asmgen.c:145
i_bne
#define i_bne(rctx, id)
Definition: rasm.h:601
rasm_print
void int rasm_print(RasmContext *rctx, FILE *fp)
Definition: rasm_print.c:432
SwsAArch64Context::out
RasmOp out[4]
Definition: ops_asmgen.c:164
i_ld2
#define i_ld2(rctx, op0, op1)
Definition: rasm.h:559
i_fmla
#define i_fmla(rctx, op0, op1, op2)
Definition: rasm.h:554
asmgen_op_read_bit
static void asmgen_op_read_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:389
RasmNode
Definition: rasm.h:144
i_rev16
#define i_rev16(rctx, op0, op1)
Definition: rasm.h:572
ops_impl.c
asmgen_op_write_nibble
static void asmgen_op_write_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:530
IMM
#define IMM(val)
Definition: rasm.h:91
AARCH64_SWS_OP_CLEAR
@ AARCH64_SWS_OP_CLEAR
Definition: ops_impl.h:54
SwsAArch64Context::el_size
size_t el_size
Definition: ops_asmgen.c:169
clobbered_gprs
static unsigned clobbered_gprs(const SwsAArch64Context *s, SwsAArch64OpMask mask, RasmOp regs[MAX_SAVED_REGS])
Definition: ops_asmgen.c:274
i_fmin
#define i_fmin(rctx, op0, op1, op2)
Definition: rasm.h:553
SwsAArch64Context::bx_start
RasmOp bx_start
Definition: ops_asmgen.c:138
AARCH64_SWS_OP_NONE
@ AARCH64_SWS_OP_NONE
Definition: ops_impl.h:39
LOOP_MASK_VH
#define LOOP_MASK_VH(s, p, idx)
Definition: ops_asmgen.c:180
a64op_vec_views
void a64op_vec_views(RasmOp op, AArch64VecViews *out)
Definition: rasm.c:330
SwsAArch64Context::bx_end
RasmOp bx_end
Definition: ops_asmgen.c:140
AARCH64_SWS_OP_READ_NIBBLE
@ AARCH64_SWS_OP_READ_NIBBLE
Definition: ops_impl.h:41
SwsAArch64Context
Definition: ops_asmgen.c:132
AArch64VecViews::s
RasmOp s
Definition: rasm.h:459
AARCH64_SWS_OP_PACK
@ AARCH64_SWS_OP_PACK
Definition: ops_impl.h:51
AARCH64_SWS_OP_SWAP_BYTES
@ AARCH64_SWS_OP_SWAP_BYTES
Definition: ops_impl.h:48
AARCH64_SWS_OP_READ_BIT
@ AARCH64_SWS_OP_READ_BIT
Definition: ops_impl.h:40
i_st2
#define i_st2(rctx, op0, op1)
Definition: rasm.h:576
SwsAArch64Context::impl
RasmOp impl
Definition: ops_asmgen.c:137
i_st3
#define i_st3(rctx, op0, op1)
Definition: rasm.h:577
i_ushr
#define i_ushr(rctx, op0, op1, op2)
Definition: rasm.h:592
RasmOp
Runtime assembler for AArch64.
Definition: rasm.h:43
AARCH64_SWS_OP_MAX
@ AARCH64_SWS_OP_MAX
Definition: ops_impl.h:58
asmgen_op_read_nibble
static void asmgen_op_read_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:426
i_addv
#define i_addv(rctx, op0, op1)
Definition: rasm.h:540
asmgen_op_clear
static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:849
swizzle_a64op
static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:640
val
static double val(void *priv, double ch)
Definition: aeval.c:77
rasm_add_label
RasmNode * rasm_add_label(RasmContext *rctx, int id)
Definition: rasm.c:146
i_fadd
#define i_fadd(rctx, op0, op1, op2)
Definition: rasm.h:550
asmgen_op_pack
static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:783
LINEAR_MASK_1
#define LINEAR_MASK_1
Definition: ops_impl.h:127
i_ld1r
#define i_ld1r(rctx, op0, op1)
Definition: rasm.h:558
SwsAArch64Context::y_start
RasmOp y_start
Definition: ops_asmgen.c:139
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
SwsAArch64Context::exec
RasmOp exec
Definition: ops_asmgen.c:136
vv_1
static RasmOp vv_1(RasmOp op0)
Definition: rasm.h:445
a64op_elem
static RasmOp a64op_elem(RasmOp op, uint8_t idx)
Definition: rasm.h:421
FFMIN
#define FFMIN(a, b)
Definition: ops_asmgen.c:50
float
float
Definition: af_crystalizer.c:122
AArch64VecViews::be
RasmOp be[2]
Definition: rasm.h:471
AARCH64_SWS_OP_WRITE_NIBBLE
@ AARCH64_SWS_OP_WRITE_NIBBLE
Definition: ops_impl.h:45
AArch64VecViews::b
RasmOp b
Definition: rasm.h:457
SwsAArch64OpMask
uint16_t SwsAArch64OpMask
Definition: ops_impl.h:66
AARCH64_SWS_OP_DITHER
@ AARCH64_SWS_OP_DITHER
Definition: ops_impl.h:61
AARCH64_SWS_OP_RSHIFT
@ AARCH64_SWS_OP_RSHIFT
Definition: ops_impl.h:53
s
#define s(width, name)
Definition: cbs_vp9.c:198
offsets
static const int offsets[]
Definition: hevc_pel.c:34
SwsAArch64Context::op1_impl
RasmOp op1_impl
Definition: ops_asmgen.c:153
impl_func_name
static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:113
AARCH64_SWS_OP_LINEAR
@ AARCH64_SWS_OP_LINEAR
Definition: ops_impl.h:60
LOOP
#define LOOP(mask, idx)
Definition: ops_impl.h:112
frame_size
int frame_size
Definition: mxfenc.c:2489
asmgen_op_cps
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1315
FFMAX
#define FFMAX(a, b)
Definition: ops_asmgen.c:49
SwsAArch64Context::vl
RasmOp vl[4]
Definition: ops_asmgen.c:158
offsetof_impl_cont
#define offsetof_impl_cont
Definition: ops_impl.h:173
CMT
#define CMT(comment)
Definition: ops_asmgen.c:184
linear_pass
static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vt, RasmOp *vc, int save_mask, bool vh_pass)
Performs one pass of the linear transform over a single vector bank (low or high).
Definition: ops_asmgen.c:1062
ops_entries.c
AARCH64_SWS_OP_CONVERT
@ AARCH64_SWS_OP_CONVERT
Definition: ops_impl.h:55
limits.h
i_ins
#define i_ins(rctx, op0, op1)
Definition: rasm.h:556
asmgen_op_convert
static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:873
field
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this field
Definition: writing_filters.txt:78
v_8b
static RasmOp v_8b(RasmOp op)
Definition: rasm.h:436
i_ldr
#define i_ldr(rctx, op0, op1)
Definition: rasm.h:563
a64op_make_vec
static RasmOp a64op_make_vec(uint8_t n, uint8_t el_count, uint8_t el_size)
Definition: rasm.h:365
op_fields
static const ParamField * op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS]
Definition: ops_impl.c:323
AARCH64_PIXEL_F32
@ AARCH64_PIXEL_F32
Definition: ops_impl.h:33
asmgen_op_scale
static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1033
asmgen
static int asmgen(void)
Definition: ops_asmgen.c:1510
fields
the definition of that something depends on the semantic of the filter The callback must examine the status of the filter s links and proceed accordingly The status of output links is stored in the status_in and status_out fields and tested by the then the processing requires a frame on this link and the filter is expected to make efforts in that direction The status of input links is stored by the fifo and status_out fields
Definition: filter_design.txt:155
aarch64_op_impl_func_name
void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:125
aarch64_op_impl_lookup_str
static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params, const SwsAArch64OpImplParams *prev, const char *p_str)
Definition: ops_asmgen.c:1402
i_br
#define i_br(rctx, op0)
Definition: rasm.h:546
i_cmp
#define i_cmp(rctx, op0, op1)
Definition: rasm.h:547
AARCH64_SWS_OP_SCALE
@ AARCH64_SWS_OP_SCALE
Definition: ops_impl.h:59
offsetof_exec_out_bump
#define offsetof_exec_out_bump
Definition: ops_impl.h:172
NULL
#define NULL
Definition: coverity.c:32
aarch64_pixel_size
static size_t aarch64_pixel_size(SwsAArch64PixelType fmt)
Definition: ops_asmgen.c:99
impl_params
static const SwsAArch64OpImplParams impl_params[]
Implementation parameters for all exported functions.
Definition: ops_asmgen.c:93
asmgen_op_unpack
static void asmgen_op_unpack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:711
i_fmul
#define i_fmul(rctx, op0, op1, op2)
Definition: rasm.h:555
asmgen_op_write_packed_n
static void asmgen_op_write_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:557
a64op_post
static RasmOp a64op_post(RasmOp op, int16_t imm)
Definition: rasm.h:501
AARCH64_SWS_OP_READ_PACKED
@ AARCH64_SWS_OP_READ_PACKED
Definition: ops_impl.h:42
i_umin
#define i_umin(rctx, op0, op1, op2)
Definition: rasm.h:587
MAX_SAVED_REGS
#define MAX_SAVED_REGS
Definition: ops_asmgen.c:264
LOOP_VH
#define LOOP_VH(s, mask, idx)
Definition: ops_asmgen.c:179
offsetof_exec_out
#define offsetof_exec_out
Definition: ops_impl.h:170
sizeof_impl
#define sizeof_impl
Definition: ops_impl.h:175
i_add
#define i_add(rctx, op0, op1, op2)
Definition: rasm.h:539
asmgen_op_read_planar
static void asmgen_op_read_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:471
rasm.c
asmgen_process
static void asmgen_process(SwsAArch64Context *s, SwsAArch64OpMask mask)
Definition: ops_asmgen.c:289
rasm_new_label
int rasm_new_label(RasmContext *rctx, const char *name)
Allocate a new label ID with the given name.
Definition: rasm.c:282
SwsAArch64Context::tmp0
RasmOp tmp0
Definition: ops_asmgen.c:148
a64op_sp
static RasmOp a64op_sp(void)
Definition: rasm.h:356
SwsAArch64Context::vt
RasmOp vt[12]
Definition: ops_asmgen.c:160
AARCH64_SWS_OP_WRITE_PLANAR
@ AARCH64_SWS_OP_WRITE_PLANAR
Definition: ops_impl.h:47
i_uxtl
#define i_uxtl(rctx, op0, op1)
Definition: rasm.h:593
SwsAArch64Context::use_vh
bool use_vh
Definition: ops_asmgen.c:172
LOOP_MASK
#define LOOP_MASK(p, idx)
Definition: ops_impl.h:119
AARCH64_SWS_OP_LSHIFT
@ AARCH64_SWS_OP_LSHIFT
Definition: ops_impl.h:52
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
i_ldrb
#define i_ldrb(rctx, op0, op1)
Definition: rasm.h:564
i_shl
#define i_shl(rctx, op0, op1, op2)
Definition: rasm.h:574
i_fmax
#define i_fmax(rctx, op0, op1, op2)
Definition: rasm.h:552
size
int size
Definition: twinvq_data.h:10344
AArch64VecViews::h
RasmOp h
Definition: rasm.h:458
SwsAArch64Context::load_cont_node
RasmNode * load_cont_node
Definition: ops_asmgen.c:155
i_zip2
#define i_zip2(rctx, op0, op1, op2)
Definition: rasm.h:597
SwsAArch64OpImplParams::op
SwsAArch64OpType op
Definition: ops_impl.h:93
SwsAArch64Context::vec_size
size_t vec_size
Definition: ops_asmgen.c:171
i_fcvtzu
#define i_fcvtzu(rctx, op0, op1)
Definition: rasm.h:551
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:166
asmgen_op_read_packed_n
static void asmgen_op_read_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:452
i_ucvtf
#define i_ucvtf(rctx, op0, op1)
Definition: rasm.h:585
AARCH64_SWS_OP_WRITE_BIT
@ AARCH64_SWS_OP_WRITE_BIT
Definition: ops_impl.h:44
a64op_off
static RasmOp a64op_off(RasmOp op, int16_t imm)
Definition: rasm.h:499
AARCH64_SWS_OP_READ_PLANAR
@ AARCH64_SWS_OP_READ_PLANAR
Definition: ops_impl.h:43
av_freep
static void av_freep(void *ptr)
Definition: ops_asmgen.c:52
AARCH64_SWS_OP_EXPAND
@ AARCH64_SWS_OP_EXPAND
Definition: ops_impl.h:56
i_uxtl2
#define i_uxtl2(rctx, op0, op1)
Definition: rasm.h:594
AArch64VecViews::de
RasmOp de[2]
Definition: rasm.h:472
SwsAArch64Context::tmp1
RasmOp tmp1
Definition: ops_asmgen.c:149
AARCH64_SWS_OP_UNPACK
@ AARCH64_SWS_OP_UNPACK
Definition: ops_impl.h:50
vv_4
static RasmOp vv_4(RasmOp op0, RasmOp op1, RasmOp op2, RasmOp op3)
Definition: rasm.h:448
i_lsr
#define i_lsr(rctx, op0, op1, op2)
Definition: rasm.h:566
rasm_annotate_next
void rasm_annotate_next(RasmContext *rctx, const char *comment)
Definition: rasm.c:263
clobbered_frame_size
static unsigned clobbered_frame_size(unsigned n)
Definition: ops_asmgen.c:215
RasmContext
Definition: rasm.h:184
lookup
int lookup
Definition: vorbis_enc_data.h:428
i_ldp
#define i_ldp(rctx, op0, op1, op2)
Definition: rasm.h:562
delta
float delta
Definition: vorbis_enc_data.h:430
asmgen_op_swap_bytes
static void asmgen_op_swap_bytes(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:601
AArch64VecViews::q
RasmOp q
Definition: rasm.h:461
offsetof_impl_priv
#define offsetof_impl_priv
Definition: ops_impl.h:174
asmgen_set_load_cont_node
static void asmgen_set_load_cont_node(SwsAArch64Context *s)
Set node where the continuation address will be loaded and impl will be incremented.
Definition: ops_asmgen.c:376
asmgen_op_write_packed
static void asmgen_op_write_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:568
AArch64VecViews::h4
RasmOp h4
Definition: rasm.h:465
i_xtn
#define i_xtn(rctx, op0, op1)
Definition: rasm.h:595
SwsAArch64Context::in
RasmOp in[4]
Definition: ops_asmgen.c:163
av_assert0
#define av_assert0(cond)
Definition: ops_asmgen.c:43
a64op_pre
static RasmOp a64op_pre(RasmOp op, int16_t imm)
Definition: rasm.h:500
SWIZZLE_TMP
#define SWIZZLE_TMP
Definition: ops_asmgen.c:628
asmgen_op_lshift
static void asmgen_op_lshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:821
SwsAArch64Context::el_count
size_t el_count
Definition: ops_asmgen.c:170
rasm_annotate_nextf
void rasm_annotate_nextf(RasmContext *rctx, char *s, size_t n, const char *fmt,...)
Definition: rasm.c:273
rasm_op_label
static RasmOp rasm_op_label(int id)
Definition: rasm.h:95
i_umax
#define i_umax(rctx, op0, op1, op2)
Definition: rasm.h:586
ret
ret
Definition: filter_design.txt:187
offsetof_exec_in
#define offsetof_exec_in
These values will be used by ops_asmgen to access fields inside of SwsOpExec and SwsOpImpl.
Definition: ops_impl.h:169
MASK_SET
#define MASK_SET(mask, idx, val)
Definition: ops_impl.h:110
rasm_func_begin
int rasm_func_begin(RasmContext *rctx, const char *name, bool export, bool jumpable)
Definition: rasm.c:209
i_mov16b
#define i_mov16b(rctx, op0, op1)
Definition: rasm.h:618
AARCH64_PIXEL_U8
@ AARCH64_PIXEL_U8
Definition: ops_impl.h:30
SwsAArch64Context::cont
RasmOp cont
Definition: ops_asmgen.c:154
i_b
#define i_b(rctx, op0)
Definition: rasm.h:543
lookup_gen
static int lookup_gen(void)
Definition: ops_asmgen.c:1468
SwsAArch64Context::in_bump
RasmOp in_bump[4]
Definition: ops_asmgen.c:165
AARCH64_PIXEL_U32
@ AARCH64_PIXEL_U32
Definition: ops_impl.h:32
MASK_GET
#define MASK_GET(mask, idx)
Definition: ops_impl.h:109
i_str
#define i_str(rctx, op0, op1)
Definition: rasm.h:580
dynarray.h
swizzle_emit
static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
Definition: ops_asmgen.c:647
i_and
#define i_and(rctx, op0, op1, op2)
Definition: rasm.h:542
i_ldrh
#define i_ldrh(rctx, op0, op1)
Definition: rasm.h:565
SwsAArch64Context::op0_func
RasmOp op0_func
Definition: ops_asmgen.c:152
asmgen_prologue
static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:220
asmgen_op_expand
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:951
i_ubfiz
#define i_ubfiz(rctx, op0, op1, op2, op3)
Definition: rasm.h:584
asmgen_epilogue
static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:241
Windows::Graphics::DirectX::Direct3D11::p
IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p
Definition: vsrc_gfxcapture_winrt.hpp:53
i_orr
#define i_orr(rctx, op0, op1, op2)
Definition: rasm.h:570
AARCH64_SWS_OP_WRITE_PACKED
@ AARCH64_SWS_OP_WRITE_PACKED
Definition: ops_impl.h:46
SwsAArch64OpImplParams
SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType operation.
Definition: ops_impl.h:92
i_rev32
#define i_rev32(rctx, op0, op1)
Definition: rasm.h:573
i_stp
#define i_stp(rctx, op0, op1, op2)
Definition: rasm.h:579
asmgen_op_min
static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:983
i_movi
#define i_movi(rctx, op0, op1)
Definition: rasm.h:568
i_ret
#define i_ret(rctx)
Definition: rasm.h:571
AARCH64_PIXEL_U16
@ AARCH64_PIXEL_U16
Definition: ops_impl.h:31
a64op_x
static RasmOp a64op_x(RasmOp op)
Definition: rasm.h:360
asmgen_op_linear
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1148
offsetof_exec_in_bump
#define offsetof_exec_in_bump
Definition: ops_impl.h:171
SwsAArch64Context::bx
RasmOp bx
Definition: ops_asmgen.c:144
asmgen_op_rshift
static void asmgen_op_rshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:835
linear_index_is_offset
static int linear_index_is_offset(int idx)
Definition: ops_impl.h:150
linear_num_vregs
static int linear_num_vregs(const SwsAArch64OpImplParams *params)
Definition: ops_impl.h:136
SwsAArch64PixelType
SwsAArch64PixelType
Definition: ops_impl.h:29
print_swizzle_v
static const char * print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:630
width
#define width
Definition: dsp.h:89
asmgen_op_swizzle
static void asmgen_op_swizzle(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:659
SwsAArch64Context::y_end
RasmOp y_end
Definition: ops_asmgen.c:141
snprintf
#define snprintf
Definition: snprintf.h:34
i_ushl
#define i_ushl(rctx, op0, op1, op2)
Definition: rasm.h:589
rasm_add_comment
RasmNode * rasm_add_comment(RasmContext *rctx, const char *comment)
Definition: rasm.c:117
src
#define src
Definition: vp8dsp.c:248
a64op_lr
static RasmOp a64op_lr(void)
Definition: rasm.h:355
a64op_gpr_n
static uint8_t a64op_gpr_n(RasmOp op)
Definition: rasm.h:350
i_mov
#define i_mov(rctx, op0, op1)
Definition: rasm.h:567
v_q
static RasmOp v_q(RasmOp op)
Definition: rasm.h:433
AArch64VecViews::b8
RasmOp b8
Definition: rasm.h:463