FFmpeg
ops_asmgen.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2026 Ramiro Polla
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <assert.h>
22 #include <limits.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 
28 #ifdef _WIN32
29 #include <io.h>
30 #include <fcntl.h>
31 #endif
32 
33 /**
34  * This file is compiled as a standalone build-time tool and must not depend
35  * on internal FFmpeg libraries. The necessary utils are redefined below using
36  * standard C equivalents.
37  */
38 
39 #define AVUTIL_AVASSERT_H
40 #define AVUTIL_LOG_H
41 #define AVUTIL_MACROS_H
42 #define AVUTIL_MEM_H
43 #define av_assert0(cond) assert(cond)
44 #define av_malloc(s) malloc(s)
45 #define av_mallocz(s) calloc(1, s)
46 #define av_realloc(p, s) realloc(p, s)
47 #define av_strdup(s) strdup(s)
48 #define av_free(p) free(p)
49 #define FFMAX(a,b) ((a) > (b) ? (a) : (b))
50 #define FFMIN(a,b) ((a) > (b) ? (b) : (a))
51 
/**
 * Replacement for libavutil's av_freep(): frees the buffer pointed to by
 * *ptr and resets *ptr to NULL so the caller's pointer cannot dangle.
 *
 * @param ptr address of the pointer to free (a T** passed as void*);
 *            NULL is accepted and is a no-op
 */
static void av_freep(void *ptr)
{
    void **pptr = (void **) ptr;
    if (pptr) {
        /* free(NULL) is a no-op, so no extra guard is needed */
        free(*pptr);
        *pptr = NULL;
    }
}
62 
63 #include "libavutil/dynarray.h"
64 
/**
 * Replacement for libavutil's av_dynarray2_add(), implemented on top of
 * the FF_DYNARRAY_ADD() macro from libavutil/dynarray.h.
 *
 * Appends one element of elem_size bytes to the dynamic array *tab_ptr.
 *
 * @param tab_ptr   address of the array pointer (updated when reallocated)
 * @param nb_ptr    address of the element count (updated on success)
 * @param elem_size size in bytes of one element
 * @param elem_data data copied into the new slot, or NULL to leave the
 *                  slot uninitialized
 * @return pointer to the new element inside the array, or NULL when the
 *         failure block below ran (array freed, count reset to 0)
 */
static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
                              const uint8_t *elem_data)
{
    uint8_t *tab_elem_data = NULL;

    FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, {
        /* success block: *nb_ptr is the index of the newly added slot */
        tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size;
        if (elem_data)
            memcpy(tab_elem_data, elem_data, elem_size);
    }, {
        /* failure block: drop the whole array, matching libavutil */
        av_freep(tab_ptr);
        *nb_ptr = 0;
    });
    return tab_elem_data;
}
80 
81 /*********************************************************************/
82 #include "rasm.c"
83 #include "rasm_print.c"
84 #include "ops_impl.c"
85 
86 /**
87  * Implementation parameters for all exported functions. This list is
88  * compiled by performing a dummy run of all conversions in sws_ops and
89  * collecting all functions that need to be generated. This is achieved
90  * by running:
91  * make sws_ops_entries_aarch64
92  */
94 #include "ops_entries.c"
95  { .op = AARCH64_SWS_OP_NONE }
96 };
97 
/*********************************************************************/
/* NOTE(review): the function signature line is missing from this view of
 * the file (extraction dropped internal line 99).  From the call sites
 * below this is presumably `static int aarch64_pixel_size(<pixel type> fmt)`
 * returning the byte size of one element -- confirm against full source. */
{
    switch (fmt) {
    case AARCH64_PIXEL_U8: return 1;
    case AARCH64_PIXEL_U16: return 2;
    case AARCH64_PIXEL_U32: return 4;
    case AARCH64_PIXEL_F32: return 4;
    default:
        av_assert0(!"Invalid pixel type!");
        break;
    }
    /* not reached; keeps non-void return paths explicit */
    return 0;
}
112 
113 static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
114 {
115  buf_appendf(buf, size, "ff_sws");
116  const ParamField **fields = op_fields[params->op];
117  for (int i = 0; fields[i]; i++) {
118  const ParamField *field = fields[i];
119  void *p = (void *) (((uintptr_t) params) + field->offset);
120  field->print_str(buf, size, p);
121  }
122  buf_appendf(buf, size, "_neon");
123 }
124 
125 void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
126 {
127  impl_func_name(&buf, &size, params);
128  av_assert0(size && "string buffer exhausted");
129 }
130 
131 /*********************************************************************/
132 typedef struct SwsAArch64Context {
134 
135  /* SwsOpFunc arguments. */
142 
143  /* Loop iterator variables. */
146 
147  /* Scratch registers. */
150 
151  /* CPS-related variables. */
155 
156  /* Vector registers. Two banks (low and high) are used. */
157  RasmOp vl[ 4];
158  RasmOp vh[ 4];
159  RasmOp vt[12];
160 
161  /* Read/Write data pointers and padding. */
162  RasmOp in[4];
166 
167  /* Vector register dimensions. */
168  size_t el_size;
169  size_t el_count;
170  size_t vec_size;
171  bool use_vh;
173 
174 /*********************************************************************/
175 /* Helpers functions. */
176 
177 /* Looping when s->use_vh is set. */
178 #define LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx)
179 #define LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx)
180 #define LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx)
181 
182 /* Inline rasm comments. */
183 #define CMT(comment) rasm_annotate(r, comment)
184 #define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__)
185 
186 /* Reshape all vector registers for current SwsOp. */
187 static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
188 {
189  s->vl[ 0] = a64op_make_vec( 0, el_count, el_size);
190  s->vl[ 1] = a64op_make_vec( 1, el_count, el_size);
191  s->vl[ 2] = a64op_make_vec( 2, el_count, el_size);
192  s->vl[ 3] = a64op_make_vec( 3, el_count, el_size);
193  s->vh[ 0] = a64op_make_vec( 4, el_count, el_size);
194  s->vh[ 1] = a64op_make_vec( 5, el_count, el_size);
195  s->vh[ 2] = a64op_make_vec( 6, el_count, el_size);
196  s->vh[ 3] = a64op_make_vec( 7, el_count, el_size);
197  s->vt[ 0] = a64op_make_vec(16, el_count, el_size);
198  s->vt[ 1] = a64op_make_vec(17, el_count, el_size);
199  s->vt[ 2] = a64op_make_vec(18, el_count, el_size);
200  s->vt[ 3] = a64op_make_vec(19, el_count, el_size);
201  s->vt[ 4] = a64op_make_vec(20, el_count, el_size);
202  s->vt[ 5] = a64op_make_vec(21, el_count, el_size);
203  s->vt[ 6] = a64op_make_vec(22, el_count, el_size);
204  s->vt[ 7] = a64op_make_vec(23, el_count, el_size);
205  s->vt[ 8] = a64op_make_vec(24, el_count, el_size);
206  s->vt[ 9] = a64op_make_vec(25, el_count, el_size);
207  s->vt[10] = a64op_make_vec(26, el_count, el_size);
208  s->vt[11] = a64op_make_vec(27, el_count, el_size);
209 }
210 
211 /*********************************************************************/
212 /* Function frame */
213 
/* Stack frame size in bytes needed to save n registers: registers are
 * stored in pairs, and AArch64 requires sp to stay 16-byte aligned, so
 * round the pair count up and multiply by 16. */
static unsigned clobbered_frame_size(unsigned n)
{
    unsigned pairs = (n + 1) / 2;
    return pairs * 16;
}
218 
/* Emit the function prologue: push the n clobbered callee-saved registers.
 * The very first store pre-decrements sp by the whole 16-byte-aligned
 * frame; subsequent registers are stored pairwise at increasing offsets,
 * with a single str for an odd trailing register. */
static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
{
    RasmContext *r = s->rctx;
    RasmOp sp = a64op_sp();
    unsigned frame_size = clobbered_frame_size(n);
    /* NOTE(review): -frame_size wraps as unsigned; presumably a64op_pre()
     * takes a signed offset parameter -- confirm its prototype. */
    RasmOp sp_pre = a64op_pre(sp, -frame_size);

    rasm_add_comment(r, "prologue");
    if (n == 0) {
        /* no-op */
    } else if (n == 1) {
        i_str(r, regs[0], sp_pre);
    } else {
        /* first pair carries the pre-index sp adjustment */
        i_stp(r, regs[0], regs[1], sp_pre);
        for (unsigned i = 2; i + 1 < n; i += 2)
            i_stp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
        if (n & 1)
            i_str(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
    }
}
239 
/* Emit the function epilogue: restore the n registers saved by
 * asmgen_prologue(), in reverse order, with the final load post-indexing
 * sp back up by the whole frame. */
static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
{
    RasmContext *r = s->rctx;
    RasmOp sp = a64op_sp();
    unsigned frame_size = clobbered_frame_size(n);
    RasmOp sp_post = a64op_post(sp, frame_size);

    rasm_add_comment(r, "epilogue");
    if (n == 0) {
        /* no-op */
    } else if (n == 1) {
        i_ldr(r, regs[0], sp_post);
    } else {
        /* odd trailing register first, then pairs from high to low;
         * for n <= 3 the loop start (n & ~1u) - 2 is 0, so it does not
         * iterate and no unsigned wrap-around occurs */
        if (n & 1)
            i_ldr(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
        for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2)
            i_ldp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
        /* last pair restores sp via post-index */
        i_ldp(r, regs[0], regs[1], sp_post);
    }
}
260 
261 /*********************************************************************/
262 /* Callee-saved registers (r19-r28). */
263 #define MAX_SAVED_REGS 10
264 
265 static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
266  RasmOp gpr)
267 {
268  const int n = a64op_gpr_n(gpr);
269  if (n >= 19 && n <= 28)
270  regs[(*count)++] = gpr;
271 }
272 
/* Collect into regs[] every callee-saved GPR used by the active planes
 * (in/out pointers and their per-row bump registers).
 * Returns the number of registers collected. */
static unsigned clobbered_gprs(const SwsAArch64Context *s,
                               const SwsAArch64OpImplParams *p,
                               RasmOp regs[MAX_SAVED_REGS])
{
    unsigned count = 0;
    LOOP_MASK(p, i) {
        clobber_gpr(regs, &count, s->in[i]);
        clobber_gpr(regs, &count, s->out[i]);
        clobber_gpr(regs, &count, s->in_bump[i]);
        clobber_gpr(regs, &count, s->out_bump[i]);
    }
    return count;
}
286 
/* NOTE(review): the signature line is missing from this view (internal
 * line 287); presumably the "process" entry-point generator taking
 * (SwsAArch64Context *s, const SwsAArch64OpImplParams *p) like the other
 * generators in this file -- confirm against the full source. */
{
    RasmContext *r = s->rctx;
    char func_name[128];
    char buf[64];

    /**
     * The process/process_return functions for aarch64 work similarly
     * to the x86 backend. The description in x86/ops_common.asm mostly
     * holds as well here.
     */

    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);

    rasm_func_begin(r, func_name, true, false);

    /* Function prologue: save only the callee-saved GPRs we will touch. */
    RasmOp saved_regs[MAX_SAVED_REGS];
    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
    if (nsaved)
        asmgen_prologue(s, saved_regs, nsaved);

    /* Load values from impl. */
    i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr op0_func = impl->cont;");
    i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl)); CMT("SwsOpImpl *op1_impl = impl + 1;");

    /* Load values from exec: plane pointers and per-row bumps. */
    LOOP_MASK(p, i) {
        rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, i);
        i_ldr(r, s->in[i], a64op_off(s->exec, offsetof_exec_in + (i * sizeof(uint8_t *))));
    }
    LOOP_MASK(p, i) {
        rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", i, i);
        i_ldr(r, s->out[i], a64op_off(s->exec, offsetof_exec_out + (i * sizeof(uint8_t *))));
    }
    LOOP_MASK(p, i) {
        rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = exec->in_bump[%u];", i, i);
        i_ldr(r, s->in_bump[i], a64op_off(s->exec, offsetof_exec_in_bump + (i * sizeof(ptrdiff_t))));
    }
    LOOP_MASK(p, i) {
        rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = exec->out_bump[%u];", i, i);
        i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
    }

    /* Reset x and jump to first kernel. */
    i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
    i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
    i_br (r, s->op0_func); CMT("jump to op0_func");
}
336 
/* NOTE(review): the signature line is missing from this view (internal
 * line 337); presumably the "process_return" generator taking
 * (SwsAArch64Context *s, const SwsAArch64OpImplParams *p) -- confirm. */
{
    RasmContext *r = s->rctx;
    char func_name[128];

    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);

    rasm_func_begin(r, func_name, true, true);

    /* Reset impl to first kernel. */
    i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");

    /* Perform horizontal loop. */
    int loop = rasm_new_label(r, NULL);
    i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
    i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
    i_bne(r, loop); CMT(" goto loop;");

    /* Perform vertical loop. */
    int end = rasm_new_label(r, NULL);
    i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
    i_cmp(r, s->y, s->y_end); CMT("if (y == y_end)");
    i_beq(r, end); CMT(" goto end;");

    /* Perform padding and reset x, preparing for next row. */
    LOOP_MASK(p, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
    LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
    i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");

    /* Loop back or end of function. */
    rasm_add_label(r, loop); CMT("loop:");
    i_br (r, s->op0_func); CMT("jump to op0_func");
    rasm_add_label(r, end); CMT("end:");

    /* Function epilogue: restore the same set saved by the prologue. */
    RasmOp saved_regs[MAX_SAVED_REGS];
    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
    if (nsaved)
        asmgen_epilogue(s, saved_regs, nsaved);

    i_ret(r);
}
379 
380 /*********************************************************************/
381 /* gather raw pixels from planes */
382 /* AARCH64_SWS_OP_READ_BIT */
383 /* AARCH64_SWS_OP_READ_NIBBLE */
384 /* AARCH64_SWS_OP_READ_PACKED */
385 /* AARCH64_SWS_OP_READ_PLANAR */
386 
/* NOTE(review): the signature line is missing from this view (internal
 * line 387); presumably the AARCH64_SWS_OP_READ_BIT generator -- confirm. */
{
    RasmContext *r = s->rctx;
    RasmOp bitmask_vec = s->vt[1];
    RasmOp wtmp = a64op_w(s->tmp0);
    AArch64VecViews vl[1];
    AArch64VecViews vtmp;
    AArch64VecViews shift_vec;

    a64op_vec_views(s->vt[0], &shift_vec);
    a64op_vec_views(s->vl[0], &vl[0]);
    a64op_vec_views(s->vt[2], &vtmp);

    /* Note that shift_vec has negative values, so that using it with
     * ushl actually performs a right shift. */
    rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
    i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));

    if (p->block_size == 16) {
        /* 16 pixels: broadcast each source byte into one half of the
         * vector, then shift each lane so its bit lands in bit 0. */
        i_ldrh(r, wtmp, a64op_post(s->in[0], 2)); CMT("uint16_t tmp = *in[0]++;");
        i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 16 times>};");
        i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
        i_lsr (r, wtmp, wtmp, IMM(8)); CMT("tmp >>= 8;");
        i_dup (r, vtmp.b8, wtmp); CMT("vtmp.lo = broadcast(tmp);");
        i_ins (r, vl[0].de[1], vtmp.de[0]); CMT("vl[0].hi = vtmp.lo;");
        i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
        i_and (r, vl[0].b16, vl[0].b16, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
    } else {
        /* 8 pixels: single source byte, low half only. */
        i_ldrb(r, wtmp, a64op_post(s->in[0], 1)); CMT("uint8_t tmp = *in[0]++;");
        i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};");
        i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
        i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
        i_and (r, vl[0].b8, vl[0].b8, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
    }
}
422 
424 {
425  RasmContext *r = s->rctx;
426  RasmOp nibble_mask = v_8b(s->vt[0]);
427  AArch64VecViews vl[1];
428  AArch64VecViews vtmp;
429 
430  a64op_vec_views(s->vl[0], &vl[0]);
431  a64op_vec_views(s->vt[1], &vtmp);
432 
433  rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 <repeats 8 times>};");
434  i_movi(r, nibble_mask, IMM(0x0f));
435 
436  if (p->block_size == 8) {
437  i_ldr (r, vl[0].s, a64op_post(s->in[0], 4)); CMT("vl[0] = *in[0]++;");
438  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
439  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
440  i_zip1(r, vl[0].b8, vtmp.b8, vl[0].b8); CMT("interleave");
441  } else {
442  i_ldr (r, vl[0].d, a64op_post(s->in[0], 8)); CMT("vl[0] = *in[0]++;");
443  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
444  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
445  i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16); CMT("interleave");
446  }
447 }
448 
/* NOTE(review): the signature line is missing from this view (internal
 * line 449); from the body this is the single-plane packed read:
 * one ldr/ldp into the low (and optionally high) bank. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[1];
    AArch64VecViews vh[1];

    a64op_vec_views(s->vl[0], &vl[0]);
    a64op_vec_views(s->vh[0], &vh[0]);

    /* key = (high bank in use ? 0x100 : 0) | vector size in bytes */
    switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
    case 0x008: i_ldr(r, vl[0].d, a64op_post(s->in[0], s->vec_size * 1)); break;
    case 0x010: i_ldr(r, vl[0].q, a64op_post(s->in[0], s->vec_size * 1)); break;
    case 0x108: i_ldp(r, vl[0].d, vh[0].d, a64op_post(s->in[0], s->vec_size * 2)); break;
    case 0x110: i_ldp(r, vl[0].q, vh[0].q, a64op_post(s->in[0], s->vec_size * 2)); break;
    }
}
465 
/* NOTE(review): the signature line is missing from this view (internal
 * line 466); the call sites pass (s, p, s->vl) / (s, p, s->vh), so `vx`
 * is presumably a bank of vector registers -- confirm. */
{
    RasmContext *r = s->rctx;

    /* De-interleaving loads: mask encodes how many components (2/3/4)
     * are packed per pixel. */
    switch (p->mask) {
    case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]), a64op_post(s->in[0], s->vec_size * 2)); break;
    case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->in[0], s->vec_size * 3)); break;
    case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->in[0], s->vec_size * 4)); break;
    }
}
476 
/* NOTE(review): the signature line (internal 477) AND the statement inside
 * the single-component branch (internal 480) are missing from this view;
 * the latter presumably dispatches to the single-plane read above. */
{
    if (p->mask == 0x0001) {
        /* NOTE(review): missing call here -- confirm against full source */
    } else {
        asmgen_op_read_packed_n(s, p, s->vl);
        if (s->use_vh)
            asmgen_op_read_packed_n(s, p, s->vh);
    }
}
487 
/* NOTE(review): the signature line is missing from this view (internal
 * line 488); presumably the AARCH64_SWS_OP_READ_PLANAR generator. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[4];
    AArch64VecViews vh[4];

    for (int i = 0; i < 4; i++) {
        a64op_vec_views(s->vl[i], &vl[i]);
        a64op_vec_views(s->vh[i], &vh[i]);
    }

    /* One plain load (or pair) per active plane, each from its own
     * input pointer. */
    LOOP_MASK(p, i) {
        switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
        case 0x008: i_ldr(r, vl[i].d, a64op_post(s->in[i], s->vec_size * 1)); break;
        case 0x010: i_ldr(r, vl[i].q, a64op_post(s->in[i], s->vec_size * 1)); break;
        case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], s->vec_size * 2)); break;
        case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], s->vec_size * 2)); break;
        }
    }
}
508 
509 /*********************************************************************/
510 /* write raw pixels to planes */
511 /* AARCH64_SWS_OP_WRITE_BIT */
512 /* AARCH64_SWS_OP_WRITE_NIBBLE */
513 /* AARCH64_SWS_OP_WRITE_PACKED */
514 /* AARCH64_SWS_OP_WRITE_PLANAR */
515 
517 {
518  RasmContext *r = s->rctx;
519  AArch64VecViews vl[1];
520  AArch64VecViews shift_vec;
521  AArch64VecViews vtmp0;
522  AArch64VecViews vtmp1;
523 
524  a64op_vec_views(s->vl[0], &vl[0]);
525  a64op_vec_views(s->vt[0], &shift_vec);
526  a64op_vec_views(s->vt[1], &vtmp0);
527  a64op_vec_views(s->vt[2], &vtmp1);
528 
529  rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
530  i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
531 
532  if (p->block_size == 8) {
533  i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
534  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
535  i_str (r, vtmp0.b, a64op_post(s->out[0], 1)); CMT("*out[0]++ = vtmp0;");
536  } else {
537  i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
538  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
539  i_ins (r, vtmp1.de[0], vl[0].de[1]); CMT("vtmp1.lo = vl[0].hi;");
540  i_addv(r, vtmp1.b, vtmp1.b8); CMT("vtmp1[0] = add_across(vtmp1);");
541  i_ins (r, vtmp0.be[1], vtmp1.be[0]); CMT("vtmp0[1] = vtmp1[0];");
542  i_str (r, vtmp0.h, a64op_post(s->out[0], 2)); CMT("*out[0]++ = vtmp0;");
543  }
544 }
545 
/* NOTE(review): the signature line is missing from this view (internal
 * line 546); presumably the AARCH64_SWS_OP_WRITE_NIBBLE generator. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[4];
    AArch64VecViews vtmp0;
    AArch64VecViews vtmp1;

    for (int i = 0; i < 4; i++)
        a64op_vec_views(s->vl[i], &vl[i]);
    a64op_vec_views(s->vt[0], &vtmp0);
    a64op_vec_views(s->vt[1], &vtmp1);

    /* Merge nibble pairs: shift one copy left by 4, the other right
     * by 8, OR them and narrow each 16-bit lane down to one byte. */
    if (p->block_size == 8) {
        i_shl (r, vtmp0.h4, vl[0].h4, IMM(4));
        i_ushr(r, vtmp1.h4, vl[0].h4, IMM(8));
        i_orr (r, vl[0].b8, vtmp0.b8, vtmp1.b8);
        i_xtn (r, vtmp0.b8, vl[0].h8);
        i_str (r, vtmp0.s, a64op_post(s->out[0], 4));
    } else {
        i_shl (r, vtmp0.h8, vl[0].h8, IMM(4));
        i_ushr(r, vtmp1.h8, vl[0].h8, IMM(8));
        i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16);
        i_xtn (r, vtmp0.b8, vl[0].h8);
        i_str (r, vtmp0.d, a64op_post(s->out[0], 8));
    }
}
572 
/* NOTE(review): the signature line is missing from this view (internal
 * line 573); from the body this is the single-plane packed write,
 * mirroring the single-plane packed read above. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[1];
    AArch64VecViews vh[1];

    a64op_vec_views(s->vl[0], &vl[0]);
    a64op_vec_views(s->vh[0], &vh[0]);

    /* key = (high bank in use ? 0x100 : 0) | vector size in bytes */
    switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
    case 0x008: i_str(r, vl[0].d, a64op_post(s->out[0], s->vec_size * 1)); break;
    case 0x010: i_str(r, vl[0].q, a64op_post(s->out[0], s->vec_size * 1)); break;
    case 0x108: i_stp(r, vl[0].d, vh[0].d, a64op_post(s->out[0], s->vec_size * 2)); break;
    case 0x110: i_stp(r, vl[0].q, vh[0].q, a64op_post(s->out[0], s->vec_size * 2)); break;
    }
}
589 
591 {
592  RasmContext *r = s->rctx;
593 
594  switch (p->mask) {
595  case 0x0011: i_st2(r, vv_2(vx[0], vx[1]), a64op_post(s->out[0], s->vec_size * 2)); break;
596  case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->out[0], s->vec_size * 3)); break;
597  case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->out[0], s->vec_size * 4)); break;
598  }
599 }
600 
602 {
603  if (p->mask == 0x0001) {
605  } else {
606  asmgen_op_write_packed_n(s, p, s->vl);
607  if (s->use_vh)
608  asmgen_op_write_packed_n(s, p, s->vh);
609  }
610 }
611 
613 {
614  RasmContext *r = s->rctx;
615  AArch64VecViews vl[4];
616  AArch64VecViews vh[4];
617 
618  for (int i = 0; i < 4; i++) {
619  a64op_vec_views(s->vl[i], &vl[i]);
620  a64op_vec_views(s->vh[i], &vh[i]);
621  }
622 
623  LOOP_MASK(p, i) {
624  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
625  case 0x008: i_str(r, vl[i].d, a64op_post(s->out[i], s->vec_size * 1)); break;
626  case 0x010: i_str(r, vl[i].q, a64op_post(s->out[i], s->vec_size * 1)); break;
627  case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], s->vec_size * 2)); break;
628  case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], s->vec_size * 2)); break;
629  }
630  }
631 }
632 
633 /*********************************************************************/
634 /* swap byte order (for differing endianness) */
635 /* AARCH64_SWS_OP_SWAP_BYTES */
636 
/* NOTE(review): the signature line is missing from this view (internal
 * line 637); presumably the AARCH64_SWS_OP_SWAP_BYTES generator. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[4];
    AArch64VecViews vh[4];

    for (int i = 0; i < 4; i++) {
        a64op_vec_views(s->vl[i], &vl[i]);
        a64op_vec_views(s->vh[i], &vh[i]);
    }

    /* Byte-reverse within each element; 1-byte elements fall through
     * with no code emitted (swapping a single byte is a no-op). */
    switch (aarch64_pixel_size(p->type)) {
    case sizeof(uint16_t):
        LOOP_MASK (p, i) i_rev16(r, vl[i].b16, vl[i].b16);
        LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16);
        break;
    case sizeof(uint32_t):
        LOOP_MASK (p, i) i_rev32(r, vl[i].b16, vl[i].b16);
        LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16);
        break;
    }
}
659 
660 /*********************************************************************/
661 /* rearrange channel order, or duplicate channels */
662 /* AARCH64_SWS_OP_SWIZZLE */
663 
#define SWIZZLE_TMP 0xf

/* Render a human-readable name for the vector register identified by n
 * in the given bank (vh: 0 = low "l", 1 = high "h").  SWIZZLE_TMP maps
 * to "vtmpl"/"vtmph", anything else to "vl[n]"/"vh[n]".
 * Returns buf for convenient inline use. */
static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
{
    const char bank = vh ? 'h' : 'l';
    if (n == SWIZZLE_TMP)
        snprintf(buf, 8, "vtmp%c", bank);
    else
        snprintf(buf, 8, "v%c[%u]", bank, n);
    return buf;
}
#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh)
675 
676 static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
677 {
678  if (n == SWIZZLE_TMP)
679  return s->vt[vh];
680  return vh ? s->vh[n] : s->vl[n];
681 }
682 
683 static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
684 {
685  RasmContext *r = s->rctx;
686  RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) };
687  RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) };
688 
689  i_mov (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 0), PRINT_SWIZZLE_V(src, 0));
690  if (s->use_vh) {
691  i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 1), PRINT_SWIZZLE_V(src, 1));
692  }
693 }
694 
/* NOTE(review): the signature line is missing from this view (internal
 * line 695); presumably the AARCH64_SWS_OP_SWIZZLE generator. */
{
    /* Compute used vectors (src and dst) */
    uint8_t src_used[4] = { 0 };
    bool done[4] = { true, true, true, true };
    LOOP_MASK(p, dst) {
        uint8_t src = MASK_GET(p->swizzle, dst);
        src_used[src]++;
        done[dst] = false;
    }

    /* First perform unobstructed copies: a dst whose current value is
     * not needed by any other move can be overwritten immediately. */
    for (bool progress = true; progress; ) {
        progress = false;
        for (int dst = 0; dst < 4; dst++) {
            if (done[dst] || src_used[dst])
                continue;
            uint8_t src = MASK_GET(p->swizzle, dst);
            swizzle_emit(s, dst, src);
            src_used[src]--;
            done[dst] = true;
            progress = true;
        }
    }

    /* Then swap and rotate remaining operations (cycles). */
    for (int dst = 0; dst < 4; dst++) {
        if (done[dst])
            continue;

        /* NOTE(review): a statement is missing here in this view
         * (internal line 725); presumably it saves the cycle head into
         * the temporary, e.g. swizzle_emit(s, SWIZZLE_TMP, dst);
         * before the cycle is rotated -- confirm against full source. */

        uint8_t cur_dst = dst;
        uint8_t src = MASK_GET(p->swizzle, cur_dst);
        while (src != dst) {
            swizzle_emit(s, cur_dst, src);
            done[cur_dst] = true;
            cur_dst = src;
            src = MASK_GET(p->swizzle, cur_dst);
        }

        /* Close the cycle from the saved temporary. */
        swizzle_emit(s, cur_dst, SWIZZLE_TMP);
        done[cur_dst] = true;
    }
}
740 
741 #undef SWIZZLE_TMP
742 
743 /*********************************************************************/
744 /* split tightly packed data into components */
745 /* AARCH64_SWS_OP_UNPACK */
746 
/* NOTE(review): the signature line is missing from this view (internal
 * line 747); presumably the AARCH64_SWS_OP_UNPACK generator. */
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp *vt = s->vt;
    RasmOp mask_gpr = a64op_w(s->tmp0);
    uint32_t mask_val[4] = { 0 };
    uint8_t mask_idx[4] = { 0 };
    uint8_t cur_vt = 0;

    /* Bit offset of each component inside the packed word; component 3
     * sits at bit 0, earlier components above it. */
    const int offsets[4] = {
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
        MASK_GET(p->pack, 3),
        0
    };

    /* Generate masks, de-duplicating equal widths so each distinct mask
     * value occupies only one temporary vector. */
    rasm_add_comment(r, "generate masks");
    LOOP_MASK(p, i) {
        uint32_t val = (1u << MASK_GET(p->pack, i)) - 1;
        for (int j = 0; j < 4; j++) {
            if (mask_val[j] == val) {
                mask_val[i] = mask_val[j];
                mask_idx[i] = mask_idx[j];
                break;
            }
        }
        if (!mask_val[i]) {
            /**
             * All-one values in movi only work up to 8-bit, and then
             * at full 16- or 32-bit, but not for intermediate values
             * like 10-bit. In those cases, we use mov + dup instead.
             */
            if (val <= 0xff || val == 0xffff) {
                i_movi(r, vt[cur_vt], IMM(val));
            } else {
                i_mov (r, mask_gpr, IMM(val));
                i_dup (r, vt[cur_vt], mask_gpr);
            }
            mask_val[i] = val;
            mask_idx[i] = cur_vt++;
        }
    }

    /* Loop backwards to avoid clobbering component 0. */
    LOOP_MASK_BWD (p, i) {
        if (offsets[i]) {
            i_ushr (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", i, offsets[i]);
        } else if (i) {
            i_mov16b(r, vl[i], vl[0]); CMTF("vl[%u] = vl[0];", i);
        }
    }
    LOOP_MASK_BWD_VH(s, p, i) {
        if (offsets[i]) {
            i_ushr (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", i, offsets[i]);
        } else if (i) {
            i_mov16b(r, vh[i], vh[0]); CMTF("vh[%u] = vh[0];", i);
        }
    }

    /* Apply masks (as plain 16x1-byte bitwise ops). */
    reshape_all_vectors(s, 16, 1);
    LOOP_MASK_BWD (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); }
    LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); }
}
814 
815 /*********************************************************************/
816 /* compress components into tightly packed data */
817 /* AARCH64_SWS_OP_PACK */
818 
/* NOTE(review): the signature line is missing from this view (internal
 * line 819); presumably the AARCH64_SWS_OP_PACK generator. */
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    /* Bit offset of each component in the packed result (see unpack). */
    const int offsets[4] = {
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
        MASK_GET(p->pack, 3),
        0
    };
    /* Only components with a non-zero offset need shifting. */
    uint16_t offset_mask = 0;
    LOOP_MASK(p, i) {
        if (offsets[i])
            MASK_SET(offset_mask, i, 1);
    }

    /* Perform left shift. */
    LOOP (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); }
    LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); }

    /* Combine components by OR-ing everything into component 0. */
    reshape_all_vectors(s, 16, 1);
    LOOP_MASK (p, i) {
        if (i != 0) {
            i_orr (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i);
            if (s->use_vh) {
                i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i);
            }
        }
    }
}
852 
853 /*********************************************************************/
854 /* logical left shift of raw pixel values by (u8) */
855 /* AARCH64_SWS_OP_LSHIFT */
856 
/* NOTE(review): the signature line is missing from this view (internal
 * line 857); presumably the AARCH64_SWS_OP_LSHIFT generator: shift every
 * active component left by the immediate p->shift. */
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    LOOP_MASK (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] <<= %u;", i, p->shift); }
    LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] <<= %u;", i, p->shift); }
}
866 
867 /*********************************************************************/
868 /* right shift of raw pixel values by (u8) */
869 /* AARCH64_SWS_OP_RSHIFT */
870 
/* NOTE(review): the signature line is missing from this view (internal
 * line 871); presumably the AARCH64_SWS_OP_RSHIFT generator: unsigned
 * right shift of every active component by the immediate p->shift. */
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    LOOP_MASK (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] >>= %u;", i, p->shift); }
    LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] >>= %u;", i, p->shift); }
}
880 
881 /*********************************************************************/
882 /* clear pixel values */
883 /* AARCH64_SWS_OP_CLEAR */
884 
/* NOTE(review): the signature line is missing from this view (internal
 * line 885); presumably the AARCH64_SWS_OP_CLEAR generator: broadcast a
 * per-component constant from impl->priv into each active component. */
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp clear_vec = s->vt[0];

    /**
     * TODO
     * - pack elements in impl->priv and perform smaller loads
     * - if only 1 element and not vh, load directly with ld1r
     */

    i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 clear_vec = impl->priv.v128;");

    LOOP_MASK (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i)); CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
    LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i)); CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
}
903 
904 /*********************************************************************/
905 /* convert (cast) between formats */
906 /* AARCH64_SWS_OP_CONVERT */
907 
/* NOTE(review): the signature line is missing from this view (internal
 * line 908); presumably the AARCH64_SWS_OP_CONVERT generator. */
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[4];
    AArch64VecViews vh[4];

    /**
     * Since each instruction in the convert operation needs specific
     * element types, it is simpler to use arrangement specifiers for
     * each operand instead of reshaping all vectors.
     */

    for (int i = 0; i < 4; i++) {
        a64op_vec_views(s->vl[i], &vl[i]);
        a64op_vec_views(s->vh[i], &vh[i]);
    }

    size_t src_el_size = s->el_size;
    size_t dst_el_size = aarch64_pixel_size(p->to_type);

    /**
     * This function assumes block_size is either 8 or 16, and that
     * we're always using the most amount of vector registers possible.
     * Therefore, u32 always uses the high vector bank.
     */
    if (p->type == AARCH64_PIXEL_F32) {
        rasm_add_comment(r, "f32 -> u32");
        LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4);
        LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4);
    }

    if (p->block_size == 8) {
        /* Go through u16 as the common intermediate width. */
        if (src_el_size == 1 && dst_el_size > src_el_size) {
            rasm_add_comment(r, "u8 -> u16");
            LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
            src_el_size = 2;
        } else if (src_el_size == 4 && dst_el_size < src_el_size) {
            rasm_add_comment(r, "u32 -> u16");
            LOOP_MASK(p, i) i_xtn (r, vl[i].h4, vl[i].s4);
            LOOP_MASK(p, i) i_xtn (r, vh[i].h4, vh[i].s4);
            LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
            src_el_size = 2;
        }
        if (src_el_size == 2 && dst_el_size == 4) {
            rasm_add_comment(r, "u16 -> u32");
            LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4, vl[i].h8);
            LOOP_MASK(p, i) i_uxtl (r, vl[i].s4, vl[i].h4);
            src_el_size = 4;
        } else if (src_el_size == 2 && dst_el_size == 1) {
            rasm_add_comment(r, "u16 -> u8");
            LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
            src_el_size = 1;
        }
    } else /* if (p->block_size == 16) */ {
        if (src_el_size == 1 && dst_el_size == 2) {
            rasm_add_comment(r, "u8 -> u16");
            LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8, vl[i].b16);
            LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
        } else if (src_el_size == 2 && dst_el_size == 1) {
            rasm_add_comment(r, "u16 -> u8");
            LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
            LOOP_MASK(p, i) i_xtn (r, vh[i].b8, vh[i].h8);
            LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
        }
    }

    /* See comment above for high vector bank usage for u32. */
    if (p->to_type == AARCH64_PIXEL_F32) {
        rasm_add_comment(r, "u32 -> f32");
        LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4);
        LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4);
    }
}
981 
982 /*********************************************************************/
983 /* expand integers to the full range */
984 /* AARCH64_SWS_OP_EXPAND */
985 
/**
 * Expand unsigned integer components to the full range of the destination
 * type (AARCH64_SWS_OP_EXPAND).
 *
 * Range expansion is done by zipping a vector with itself: each u8 lane x
 * effectively becomes the u16 value x * 0x0101 (and likewise u16 -> u32),
 * which maps 0xFF to 0xFFFF, i.e. full-range white stays full-range.
 * The two widening steps are independent `if`s, so u8 can expand through
 * u16 all the way to u32 in one call.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * from this file's symbol index.
 */
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    size_t src_el_size = s->el_size;
    size_t dst_el_size = aarch64_pixel_size(p->to_type);
    size_t dst_total_size = p->block_size * dst_el_size;
    size_t dst_vec_size = FFMIN(dst_total_size, 16); /* one vector register holds 16 bytes */

    /* Widening may overflow a single register; enable the high bank if so. */
    if (!s->use_vh)
        s->use_vh = (dst_vec_size != dst_total_size);

    if (src_el_size == 1) {
        rasm_add_comment(r, "u8 -> u16");
        reshape_all_vectors(s, 16, 1);
        /* zip2 first: it reads the top half of vl[i] before zip1 clobbers it. */
        LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
        LOOP_MASK   (p, i)    i_zip1(r, vl[i], vl[i], vl[i]);
    }
    if (dst_el_size == 4) {
        rasm_add_comment(r, "u16 -> u32");
        reshape_all_vectors(s, 8, 2);
        LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
        LOOP_MASK   (p, i)    i_zip1(r, vl[i], vl[i], vl[i]);
    }
}
1013 
1014 /*********************************************************************/
1015 /* numeric minimum (q4) */
1016 /* AARCH64_SWS_OP_MIN */
1017 
/**
 * Clamp all active components to per-component numeric minima
 * (AARCH64_SWS_OP_MIN).
 *
 * The four limits are loaded as a single v128 from impl->priv and each lane
 * is broadcast into its own temp vector (vt[0..3]), so the clamp for each
 * component i uses its own limit. fmin is emitted for f32 pixels, umin for
 * unsigned integers.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * by analogy with asmgen_op_max() from this file's symbol index.
 */
static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp *vt = s->vt;
    RasmOp min_vec = s->vt[4]; /* scratch vector holding all four limits */

    i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 min_vec = impl->priv.v128;");
    LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i)); CMTF("v128 vmin%u = min_vec[%u];", i, i); }

    if (p->type == AARCH64_PIXEL_F32) {
        LOOP_MASK   (p, i)    { i_fmin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
        LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
    } else {
        LOOP_MASK   (p, i)    { i_umin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
        LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
    }
}
1037 
1038 /*********************************************************************/
1039 /* numeric maximum (q4) */
1040 /* AARCH64_SWS_OP_MAX */
1041 
/**
 * Clamp all active components to per-component numeric maxima
 * (AARCH64_SWS_OP_MAX).
 *
 * Mirror image of asmgen_op_min(): the four limits are loaded as one v128
 * from impl->priv, each lane broadcast into vt[0..3], then fmax (f32) or
 * umax (unsigned integer) is emitted per component.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * from this file's symbol index (definition at ops_asmgen.c:1042).
 */
static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp *vt = s->vt;
    RasmOp max_vec = s->vt[4]; /* scratch vector holding all four limits */

    i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 max_vec = impl->priv.v128;");
    LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i)); CMTF("v128 vmax%u = max_vec[%u];", i, i); }

    if (p->type == AARCH64_PIXEL_F32) {
        LOOP_MASK   (p, i)    { i_fmax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
        LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
    } else {
        LOOP_MASK   (p, i)    { i_umax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
        LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
    }
}
1061 
1062 /*********************************************************************/
1063 /* multiplication by scalar (q) */
1064 /* AARCH64_SWS_OP_SCALE */
1065 
/**
 * Multiply all active components by a single scalar
 * (AARCH64_SWS_OP_SCALE).
 *
 * Unlike min/max, the scale factor is one value broadcast to every lane of
 * every component: ld1r replicates the first element at impl->priv into all
 * lanes of scale_vec. fmul is emitted for f32 pixels, mul for integers.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * from this file's symbol index.
 */
static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp priv_ptr = s->tmp0;
    RasmOp scale_vec = s->vt[0];

    /* ld1r takes a base register, so materialize &impl->priv first. */
    i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv)); CMT("v128 *scale_vec_ptr = &impl->priv;");
    i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr));      CMT("v128 scale_vec = broadcast(*scale_vec_ptr);");

    if (p->type == AARCH64_PIXEL_F32) {
        LOOP_MASK   (p, i)    { i_fmul(r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
        LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
    } else {
        LOOP_MASK   (p, i)    { i_mul (r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
        LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
    }
}
1085 
1086 /*********************************************************************/
1087 /* generalized linear affine transform */
1088 /* AARCH64_SWS_OP_LINEAR */
1089 
1090 /**
1091  * Performs one pass of the linear transform over a single vector bank
1092  * (low or high).
1093  */
1095  RasmOp *vt, RasmOp *vc,
1096  int save_mask, bool vh_pass)
1097 {
1098  RasmContext *r = s->rctx;
1099  /**
1100  * The intermediate registers for fmul+fadd (for when SWS_BITEXACT
1101  * is set) start from temp vector 4.
1102  */
1103  RasmOp *vtmp = &vt[4];
1104  RasmOp *vx = vh_pass ? s->vh : s->vl;
1105  char cvh = vh_pass ? 'h' : 'l';
1106 
1107  if (vh_pass && !s->use_vh)
1108  return;
1109 
1110  /**
1111  * Save rows that need to be used as input after they have been already
1112  * written to.
1113  */
1114  RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] };
1115  if (save_mask) {
1116  for (int i = 0; i < 4; i++) {
1117  if (MASK_GET(save_mask, i)) {
1118  src_vx[i] = vt[i];
1119  i_mov16b(r, vt[i], vx[i]); CMTF("vsrc[%u] = v%c[%u];", i, cvh, i);
1120  }
1121  }
1122  }
1123 
1124  /**
1125  * The non-zero coefficients have been packed in aarch64_setup_linear()
1126  * in sequential order into the individual lanes of the coefficient
1127  * vector registers. We must follow the same order of execution here.
1128  */
1129  int i_coeff = 0;
1130  LOOP_MASK(p, i) {
1131  bool first = true;
1132  RasmNode *pre_mul = rasm_get_current_node(r);
1133  for (int j = 0; j < 5; j++) {
1134  if (!LINEAR_MASK_GET(p->linear.mask, i, j))
1135  continue;
1136  bool is_offset = linear_index_is_offset(j);
1137  int src_j = linear_index_to_vx(j);
1138  RasmOp vsrc = src_vx[src_j];
1139  uint8_t vc_i = i_coeff / 4;
1140  uint8_t vc_j = i_coeff & 3;
1141  RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j);
1142  i_coeff++;
1143  if (first && is_offset) {
1144  i_dup (r, vx[i], vcoeff); CMTF("v%c[%u] = broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j);
1145  } else if (first && !is_offset) {
1146  if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) {
1147  i_mov16b(r, vx[i], vsrc); CMTF("v%c[%u] = vsrc[%u];", cvh, i, src_j);
1148  } else {
1149  i_fmul (r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] = vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1150  }
1151  } else if (!p->linear.fmla) {
1152  /**
1153  * Split the multiply-accumulate into fmul+fadd. All
1154  * multiplications are performed first into temporary
1155  * registers, and only then added to the destination,
1156  * to reduce the dependency chain.
1157  * There is no need to perform multiplications by 1.
1158  */
1159  if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) {
1160  pre_mul = rasm_set_current_node(r, pre_mul);
1161  i_fmul(r, vtmp[vc_j], vsrc, vcoeff); CMTF("vtmp[%u] = vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j);
1162  pre_mul = rasm_set_current_node(r, pre_mul);
1163  i_fadd(r, vx[i], vx[i], vtmp[vc_j]); CMTF("v%c[%u] += vtmp[%u];", cvh, i, vc_j);
1164  } else {
1165  i_fadd(r, vx[i], vx[i], vsrc); CMTF("v%c[%u] += vsrc[%u];", cvh, i, vc_j);
1166  }
1167  } else {
1168  /**
1169  * Most modern aarch64 cores have a fastpath for sequences
1170  * of fmla instructions. This means that even if the coefficient
1171  * is 1, it is still faster to use fmla by 1 instead of fadd.
1172  */
1173  i_fmla(r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] += vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1174  }
1175  first = false;
1176  }
1177  }
1178 }
1179 
/**
 * Generalized linear affine transform over the active components
 * (AARCH64_SWS_OP_LINEAR).
 *
 * Preloads the packed coefficient vectors from impl->priv, computes which
 * source rows must be saved before being overwritten, then runs
 * linear_pass() once per vector bank.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * from this file's symbol index.
 */
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vt = s->vt;
    RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */
    RasmOp ptr = s->tmp0;
    RasmOp coeff_veclist;

    /* Preload coefficients from impl->priv. */
    const int num_vregs = linear_num_vregs(p);
    av_assert0(num_vregs <= 4);
    switch (num_vregs) {
    case 1: coeff_veclist = vv_1(vc[0]); break;
    case 2: coeff_veclist = vv_2(vc[0], vc[1]); break;
    case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]); break;
    case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
    }
    i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 *vcoeff_ptr = impl->priv.ptr;");
    i_ld1(r, coeff_veclist, a64op_base(ptr));              CMT("coeff_veclist = *vcoeff_ptr;");

    /* Compute mask for rows that must be saved before being overwritten. */
    uint16_t save_mask = 0;
    bool overwritten[4] = { false, false, false, false };
    LOOP_MASK(p, i) {
        for (int j = 0; j < 5; j++) {
            if (!LINEAR_MASK_GET(p->linear.mask, i, j))
                continue;
            bool is_offset = linear_index_is_offset(j);
            int src_j = linear_index_to_vx(j);
            /* NOTE(review): the bit set here is (j - 1) while the row read is
             * linear_index_to_vx(j); this is only consistent if the two are
             * equal for non-offset j — TODO confirm against ops_impl.h. */
            if (!is_offset && overwritten[src_j])
                MASK_SET(save_mask, j - 1, 1);
            /* NOTE(review): overwritten[i] is set inside the j loop, so a row
             * with an all-zero mask row is never marked — presumably such rows
             * are excluded by LOOP_MASK anyway; verify. */
            overwritten[i] = true;
        }
    }

    /* Perform linear passes for low and high vector banks. */
    linear_pass(s, p, vt, vc, save_mask, false);
    linear_pass(s, p, vt, vc, save_mask, true);
}
1219 
1220 /*********************************************************************/
1221 /* add dithering noise */
1222 /* AARCH64_SWS_OP_DITHER */
1223 
/**
 * Add dithering noise from a precomputed float matrix to all active
 * components (AARCH64_SWS_OP_DITHER).
 *
 * Components are processed in ascending y_offset order so the matrix
 * pointer only ever moves forward (the matrix is over-allocated at the
 * top, never before its start). Rows are reloaded only when the y offset
 * changes between consecutive components.
 *
 * NOTE(review): signature line was dropped by the doc extraction; restored
 * verbatim from this file's symbol index.
 */
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp ptr = s->tmp0;
    RasmOp tmp1 = s->tmp1;
    RasmOp wtmp1 = a64op_w(tmp1);       /* 32-bit view of tmp1 */
    RasmOp dither_vl = s->vt[0];        /* current dither row, low half */
    RasmOp dither_vh = s->vt[1];        /* current dither row, high half */
    RasmOp bx64 = a64op_x(s->bx);
    RasmOp y64 = a64op_x(s->y);

    /**
     * For a description of the matrix buffer layout, read the comments
     * in aarch64_setup_dither() in aarch64/ops.c.
     */

    /**
     * Sort components by y_offset value so that we can start dithering
     * with the smallest value, and increment the pointer upwards for
     * each new offset. The dither matrix is over-allocated and may be
     * over-read at the top, but it cannot be over-read before the start
     * of the buffer. Since we only mask the y offset once, this would
     * be an issue if we tried to subtract a value larger than the
     * initial y_offset.
     */
    int sorted[4];
    int n_comps = 0;
    /* Very cheap bucket sort. */
    int max_offset = 0;
    LOOP_MASK(p, i)
        max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i));
    for (int y_off = 0; y_off <= max_offset; y_off++) {
        LOOP_MASK(p, i) {
            if (MASK_GET(p->dither.y_offset, i) == y_off)
                sorted[n_comps++] = i;
        }
    }

    i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("void *ptr = impl->priv.ptr;");

    /**
     * We use ubfiz to mask and shift left in one single instruction:
     *     ubfiz <Wd>, <Wn>, #<lsb>, #<width>
     *     Wd = (Wn & ((1 << width) - 1)) << lsb;
     *
     * Given:
     *     block_size = 8, log2(block_size) = 3
     *     dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111
     *     sizeof(float) = 4, log2(sizeof(float)) = 2
     *
     * Suppose we have bx = 0bvvvv. To get x, we left shift by
     * log2(block_size) and end up with 0bvvvv000. Then we mask against
     * dither_mask, and end up with 0bv000. Finally we multiply by
     * sizeof(float), which is the same as shifting left by
     * log2(sizeof(float)). The result is 0bv00000.
     *
     * Therefore:
     *     width = log2(dither_size) - log2(block_size)
     *     lsb = log2(block_size) + log2(sizeof(float))
     */
    const int block_size_log2 = (p->block_size == 16) ? 4 : 3;
    const int dither_size_log2 = p->dither.size_log2;
    const int sizeof_float_log2 = 2;
    if (dither_size_log2 != block_size_log2) {
        /* Only needed when the matrix is wider than one block. */
        RasmOp lsb = IMM(block_size_log2 + sizeof_float_log2);
        RasmOp width = IMM(dither_size_log2 - block_size_log2);
        i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);");
        i_add  (r, ptr, ptr, tmp1);         CMT("ptr += tmp1;");
    }

    int last_y_off = -1; /* -1 => no row loaded yet */
    int prev_i = 0;
    for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) {
        int i = sorted[sorted_i];
        uint8_t y_off = MASK_GET(p->dither.y_offset, i);
        bool do_load = (y_off != last_y_off);

        if (last_y_off < 0) {
            /* On the first run, calculate pointer inside dither_matrix. */
            RasmOp lsb = IMM(dither_size_log2 + sizeof_float_log2);
            RasmOp width = IMM(dither_size_log2);
            /**
             * The ubfiz instruction for the y offset performs masking
             * by the dither matrix size and shifts by the stride.
             */
            if (y_off == 0) {
                i_ubfiz(r, tmp1, y64, lsb, width);       CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);");
            } else {
                i_add  (r, wtmp1, s->y, IMM(y_off));     CMTF("tmp1 = y + y_off[%u];", i);
                i_ubfiz(r, tmp1, tmp1, lsb, width);      CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);");
            }
            i_add(r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
        } else if (do_load) {
            /**
             * On subsequent runs, just increment the pointer.
             * The matrix is over-allocated, so we don't risk
             * overreading.
             */
            int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float);
            i_add(r, ptr, ptr, IMM(delta)); CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i);
        }

        if (do_load) {
            /* Load one full dither row (two q registers) at once. */
            RasmOp dither_vlq = v_q(dither_vl);
            RasmOp dither_vhq = v_q(dither_vh);
            i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;");
        }

        i_fadd (r, vl[i], vl[i], dither_vl); CMTF("vl[%u] += vditherl;", i);
        if (s->use_vh) {
            i_fadd(r, vh[i], vh[i], dither_vh); CMTF("vh[%u] += vditherh;", i);
        }

        last_y_off = y_off;
        prev_i = i;
    }
}
1343 
1344 /*********************************************************************/
/**
 * Generate one continuation-passing-style (CPS) kernel for operation `p`.
 *
 * Emits the function prologue, configures vector register shapes from the
 * pixel type and block size, loads the continuation pointer, dispatches to
 * the per-op generator, then advances `impl` and branches to the
 * continuation (no ret: kernels chain by direct branch).
 *
 * NOTE(review): this block was recovered from a doc listing that dropped
 * several `case` lines of the switch below (original lines 1371-1380 and
 * 1386, e.g. the READ/WRITE and SWIZZLE dispatch entries); the visible case
 * list is therefore incomplete. Signature restored from the symbol index.
 */
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;

    char func_name[128];
    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
    rasm_func_begin(r, func_name, true, true);

    /**
     * Set up vector register dimensions and reshape all vectors
     * accordingly.
     */
    size_t el_size = aarch64_pixel_size(p->type);
    size_t total_size = p->block_size * el_size;

    s->vec_size = FFMIN(total_size, 16);
    s->use_vh = (s->vec_size != total_size);

    s->el_size = el_size;
    s->el_count = s->vec_size / el_size;
    reshape_all_vectors(s, s->el_count, el_size);

    /* Common start for continuation-passing style (CPS) functions. */
    i_ldr(r, s->cont, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr cont = impl->cont;");

    switch (p->op) {
    case AARCH64_SWS_OP_UNPACK: asmgen_op_unpack(s, p); break;
    case AARCH64_SWS_OP_PACK:   asmgen_op_pack(s, p); break;
    case AARCH64_SWS_OP_LSHIFT: asmgen_op_lshift(s, p); break;
    case AARCH64_SWS_OP_RSHIFT: asmgen_op_rshift(s, p); break;
    case AARCH64_SWS_OP_CLEAR:  asmgen_op_clear(s, p); break;
    case AARCH64_SWS_OP_EXPAND: asmgen_op_expand(s, p); break;
    case AARCH64_SWS_OP_MIN:    asmgen_op_min(s, p); break;
    case AARCH64_SWS_OP_MAX:    asmgen_op_max(s, p); break;
    case AARCH64_SWS_OP_SCALE:  asmgen_op_scale(s, p); break;
    case AARCH64_SWS_OP_LINEAR: asmgen_op_linear(s, p); break;
    case AARCH64_SWS_OP_DITHER: asmgen_op_dither(s, p); break;
    /* TODO implement AARCH64_SWS_OP_SHUFFLE */
    default:
        break;
    }

    /* Common end for CPS functions. */
    i_add(r, s->impl, s->impl, IMM(sizeof_impl)); CMT("impl += 1;");
    i_br (r, s->cont);                            CMT("jump to cont");
}
1402 
/**
 * Top-level per-entry dispatcher: generate the function for one
 * SwsAArch64OpImplParams entry. The process entry point is generated by
 * asmgen_process(); every other operation becomes a CPS kernel via
 * asmgen_op_cps().
 *
 * NOTE(review): the doc listing this block was recovered from dropped the
 * `case` labels at original lines 1406 and 1409-1410 (presumably the
 * process/return dispatch labels), leaving orphan statements below.
 * Signature restored from the symbol index.
 */
static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    switch (p->op) {
        /* NOTE(review): missing `case` label(s) here — see header note. */
        asmgen_process(s, p);
        break;
        /* NOTE(review): missing `case` label(s) here — see header note. */
        break;
    default:
        asmgen_op_cps(s, p);
        break;
    }
}
1417 
1418 /*********************************************************************/
1419 static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params,
1420  const SwsAArch64OpImplParams *prev, const char *p_str)
1421 {
1422  int first_diff = 0;
1423  int prev_levels = 0;
1424  int levels = 0;
1425 
1426  /* Compute number of current levels. */
1427  if (params) {
1428  const ParamField **fields = op_fields[params->op];
1429  while (fields[levels])
1430  levels++;
1431  }
1432 
1433  /* Compute number of previous levels. */
1434  if (prev) {
1435  const ParamField **prev_fields = op_fields[prev->op];
1436  while (prev_fields[prev_levels])
1437  prev_levels++;
1438  }
1439 
1440  /* Walk up and check the conditions that match. */
1441  if (params && prev) {
1442  const ParamField **fields = op_fields[params->op];
1443  first_diff = -1;
1444  for (int i = 0; fields[i]; i++) {
1445  const ParamField *field = fields[i];
1446  if (first_diff < 0) {
1447  int diff = field->cmp_val((void *) (((uintptr_t) params) + field->offset),
1448  (void *) (((uintptr_t) prev) + field->offset));
1449  if (diff)
1450  first_diff = i;
1451  }
1452  }
1453  }
1454 
1455  /* Walk back closing conditions. */
1456  if (prev) {
1457  for (int i = prev_levels - 1; i > first_diff; i--) {
1458  buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), "");
1459  buf_appendf(&buf, &size, "%*s}\n", 4 * i, "");
1460  }
1461  }
1462 
1463  /* Walk up adding conditions to return current function. */
1464  if (params) {
1465  const ParamField **fields = op_fields[params->op];
1466  for (int i = first_diff; i < levels; i++) {
1467  const ParamField *field = fields[i];
1468  void *p = (void *) (((uintptr_t) params) + field->offset);
1469  buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name);
1470  field->print_val(&buf, &size, p);
1471  buf_appendf(&buf, &size, ")");
1472  if (i == (levels - 1)) {
1473  buf_appendf(&buf, &size, " return ");
1474  impl_func_name(&buf, &size, params);
1475  buf_appendf(&buf, &size, ";\n");
1476  } else {
1477  buf_appendf(&buf, &size, " {\n");
1478  }
1479  }
1480  }
1481 
1482  av_assert0(size && "string buffer exhausted");
1483 }
1484 
/**
 * Generate ops_lookup.c on stdout: extern declarations for every generated
 * kernel plus the ff_sws_aarch64_lookup() dispatch function.
 *
 * @return 0 (always succeeds; output errors are not checked on stdout)
 */
static int lookup_gen(void)
{
    char buf[1024];

    /**
     * The lookup function matches the SwsAArch64OpImplParams from
     * ops_entries.c to the exported functions generated by asmgen_op().
     * Each call to aarch64_op_impl_lookup_str() generates a code
     * fragment to uniquely detect the current function, opening and/or
     * closing conditions depending on the parameters of the previous
     * function.
     */

    /* External function declarations. */
    printf("#include \"libswscale/aarch64/ops_lookup.h\"\n");
    printf("\n");
    for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
        aarch64_op_impl_func_name(buf, sizeof(buf), p);
        printf("extern void %s(void);\n", buf);
    }
    printf("\n");

    /* Lookup function. */
    printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n");
    printf("{\n");
    const SwsAArch64OpImplParams *prev = NULL;
    for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
        aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->");
        printf("%s", buf);
        prev = p;
    }
    /* Final call with params == NULL closes the conditions left open
     * by the last entry. */
    aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->");
    printf("%s", buf);
    printf(" return NULL;\n");
    printf("}\n");

    return 0;
}
1523 
1524 /*********************************************************************/
1525 
1526 /* Generate all functions described by ops_entries.c */
/* Generate all functions described by ops_entries.c */
/**
 * Assigns the fixed GPR roles used by every generated kernel, generates one
 * rasm function per entry of impl_params, and prints the resulting assembly
 * to stdout.
 *
 * @return 0 on success, a negative error code (AVERROR/rasm error) otherwise
 */
static int asmgen(void)
{
    RasmContext *rctx = rasm_alloc();
    if (!rctx)
        return AVERROR(ENOMEM);

    SwsAArch64Context s = { .rctx = rctx };
    int ret;

    /**
     * The entry point of the SwsOpFunc is the `process` function. The
     * kernel functions are chained by directly branching to the next
     * operation, using a continuation-passing style design. The exit
     * point of the SwsOpFunc is the `process_return` function.
     *
     * The GPRs used by the entire call-chain are listed below.
     *
     * Function arguments are passed in r0-r5. After the parameters
     * from `exec` have been read, r0 is reused to branch to the
     * continuation functions. After the original parameters from
     * `impl` have been computed, r1 is reused as the `impl` pointer
     * for each operation.
     *
     * Loop iterators are r6 for `bx` and r3 for `y`, reused from
     * `y_start`, which doesn't need to be preserved.
     *
     * The intra-procedure-call temporary registers (r16 and r17) are
     * used as scratch registers. They may be used by call veneers and
     * PLT code inserted by the linker, so we cannot expect them to
     * persist across branches between functions.
     *
     * The Platform Register (r18) is not used.
     *
     * The read/write data pointers and padding values first use up the
     * remaining free caller-saved registers, and only then are the
     * callee-saved registers (r19-r28) used. (r19-r28 are callee-saved
     * per the AAPCS64.)
     */

    /* SwsOpFunc arguments. */
    s.exec     = a64op_gpx(0); // const SwsOpExec *exec
    s.impl     = a64op_gpx(1); // const void *priv
    s.bx_start = a64op_gpw(2); // int bx_start
    s.y_start  = a64op_gpw(3); // int y_start
    s.bx_end   = a64op_gpw(4); // int bx_end
    s.y_end    = a64op_gpw(5); // int y_end

    /* Loop iterator variables. */
    s.bx = a64op_gpw(6);
    s.y  = s.y_start; /* Reused from SwsOpFunc argument. */

    /* Scratch registers. */
    s.tmp0 = a64op_gpx(16); /* IP0 */
    s.tmp1 = a64op_gpx(17); /* IP1 */

    /* CPS-related variables. */
    s.op0_func = a64op_gpx(7);
    s.op1_impl = a64op_gpx(8);
    s.cont     = s.exec; /* Reused from SwsOpFunc argument. */

    /* Read/Write data pointers and padding. */
    s.in      [0] = a64op_gpx(9);
    s.out     [0] = a64op_gpx(10);
    s.in_bump [0] = a64op_gpx(11);
    s.out_bump[0] = a64op_gpx(12);
    s.in      [1] = a64op_gpx(13);
    s.out     [1] = a64op_gpx(14);
    s.in_bump [1] = a64op_gpx(15);
    s.out_bump[1] = a64op_gpx(19);
    s.in      [2] = a64op_gpx(20);
    s.out     [2] = a64op_gpx(21);
    s.in_bump [2] = a64op_gpx(22);
    s.out_bump[2] = a64op_gpx(23);
    s.in      [3] = a64op_gpx(24);
    s.out     [3] = a64op_gpx(25);
    s.in_bump [3] = a64op_gpx(26);
    s.out_bump[3] = a64op_gpx(27);

    /* Generate all functions from ops_entries.c using rasm. */
    const SwsAArch64OpImplParams *params = impl_params;
    while (params->op) {
        asmgen_op(&s, params++);
        if (rctx->error) {
            ret = rctx->error;
            goto error;
        }
    }

    /* Print all rasm functions to stdout. */
    printf("#include \"libavutil/aarch64/asm.S\"\n");
    printf("\n");
    ret = rasm_print(s.rctx, stdout);

error:
    rasm_free(&s.rctx);
    return ret;
}
1623 
1624 /*********************************************************************/
/**
 * Entry point. Exactly one of -lookup or -ops must be given: -lookup emits
 * the generated lookup C source, -ops emits the generated assembly. Both
 * write to stdout.
 */
int main(int argc, char *argv[])
{
    bool want_lookup = false;
    bool want_ops = false;

#ifdef _WIN32
    /* Keep stdout in binary mode so the generated output is byte-exact. */
    _setmode(_fileno(stdout), _O_BINARY);
#endif

    int argi = 1;
    while (argi < argc) {
        const char *arg = argv[argi++];
        if (strcmp(arg, "-lookup") == 0)
            want_lookup = true;
        else if (strcmp(arg, "-ops") == 0)
            want_ops = true;
    }

    /* Both set or neither set is a usage error. */
    if (want_lookup == want_ops) {
        fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n");
        return -1;
    }

    if (want_lookup)
        return lookup_gen();
    return asmgen();
}
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
asmgen_op_write_planar
static void asmgen_op_write_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:612
AARCH64_SWS_OP_MIN
@ AARCH64_SWS_OP_MIN
Definition: ops_impl.h:59
AArch64VecViews::h8
RasmOp h8
Definition: rasm.h:463
linear_index_to_vx
static int linear_index_to_vx(int idx)
Definition: ops_impl.h:157
rasm_print.c
FF_DYNARRAY_ADD
#define FF_DYNARRAY_ADD(av_size_max, av_elt_size, av_array, av_size, av_success, av_failure)
Add an element to a dynamic array.
Definition: dynarray.h:45
LINEAR_MASK_GET
#define LINEAR_MASK_GET(mask, idx, jdx)
Definition: ops_impl.h:124
SwsAArch64Context::vh
RasmOp vh[4]
Definition: ops_asmgen.c:158
rasm_alloc
RasmContext * rasm_alloc(void)
Definition: rasm.c:32
r
const char * r
Definition: vf_curves.c:127
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
printf
__device__ int printf(const char *,...)
main
int main(int argc, char *argv[])
Definition: ops_asmgen.c:1625
LOOP_MASK_BWD_VH
#define LOOP_MASK_BWD_VH(s, p, idx)
Definition: ops_asmgen.c:180
i_ld1
#define i_ld1(rctx, op0, op1)
Definition: rasm.h:552
LOOP_MASK_BWD
#define LOOP_MASK_BWD(p, idx)
Definition: ops_impl.h:122
ParamField
The following structure is used to describe one field from SwsAArch64OpImplParams.
Definition: ops_impl.c:190
reshape_all_vectors
static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
Definition: ops_asmgen.c:187
a64op_base
static RasmOp a64op_base(RasmOp op)
Definition: rasm.h:495
AArch64VecViews::b16
RasmOp b16
Definition: rasm.h:461
i_zip1
#define i_zip1(rctx, op0, op1, op2)
Definition: rasm.h:591
AArch64VecViews::d
RasmOp d
Definition: rasm.h:457
i_ld4
#define i_ld4(rctx, op0, op1)
Definition: rasm.h:556
i_mul
#define i_mul(rctx, op0, op1, op2)
Definition: rasm.h:564
a64op_gpx
static RasmOp a64op_gpx(uint8_t n)
Definition: rasm.h:352
av_dynarray2_add
static void * av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size, const uint8_t *elem_data)
Definition: ops_asmgen.c:65
asmgen_op_read_packed
static void asmgen_op_read_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:477
a64op_w
static RasmOp a64op_w(RasmOp op)
Definition: rasm.h:356
RasmContext::error
int error
Definition: rasm.h:191
clobber_gpr
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count, RasmOp gpr)
Definition: ops_asmgen.c:265
SwsAArch64Context::out_bump
RasmOp out_bump[4]
Definition: ops_asmgen.c:165
rasm_free
void rasm_free(RasmContext **prctx)
Definition: rasm.c:37
rasm_set_current_node
RasmNode * rasm_set_current_node(RasmContext *rctx, RasmNode *node)
Definition: rasm.c:199
i_st4
#define i_st4(rctx, op0, op1)
Definition: rasm.h:573
u
#define u(width, name, range_min, range_max)
Definition: cbs_apv.c:68
asmgen_op_max
static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1042
AArch64VecViews
This helper structure is used to mimic the assembler syntax for vector register modifiers.
Definition: rasm.h:452
SwsAArch64Context::rctx
RasmContext * rctx
Definition: ops_asmgen.c:133
rasm_get_current_node
RasmNode * rasm_get_current_node(RasmContext *rctx)
Definition: rasm.c:194
asmgen_op_write_bit
static void asmgen_op_write_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:516
a64op_gpw
static RasmOp a64op_gpw(uint8_t n)
Definition: rasm.h:351
i_ld3
#define i_ld3(rctx, op0, op1)
Definition: rasm.h:555
vv_2
static RasmOp vv_2(RasmOp op0, RasmOp op1)
Definition: rasm.h:443
vv_3
static RasmOp vv_3(RasmOp op0, RasmOp op1, RasmOp op2)
Definition: rasm.h:444
CMTF
#define CMTF(fmt,...)
Definition: ops_asmgen.c:184
PRINT_SWIZZLE_V
#define PRINT_SWIZZLE_V(n, vh)
Definition: ops_asmgen.c:674
AARCH64_SWS_OP_SWIZZLE
@ AARCH64_SWS_OP_SWIZZLE
Definition: ops_impl.h:51
i_dup
#define i_dup(rctx, op0, op1)
Definition: rasm.h:544
asmgen_op_dither
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1224
SwsAArch64Context::y
RasmOp y
Definition: ops_asmgen.c:145
i_bne
#define i_bne(rctx, id)
Definition: rasm.h:596
rasm_print
void int rasm_print(RasmContext *rctx, FILE *fp)
Definition: rasm_print.c:430
SwsAArch64Context::out
RasmOp out[4]
Definition: ops_asmgen.c:163
i_ld2
#define i_ld2(rctx, op0, op1)
Definition: rasm.h:554
i_fmla
#define i_fmla(rctx, op0, op1, op2)
Definition: rasm.h:549
asmgen_op_read_bit
static void asmgen_op_read_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:387
RasmNode
Definition: rasm.h:144
i_rev16
#define i_rev16(rctx, op0, op1)
Definition: rasm.h:567
ops_impl.c
asmgen_op_write_nibble
static void asmgen_op_write_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:546
IMM
#define IMM(val)
Definition: rasm.h:91
AARCH64_SWS_OP_CLEAR
@ AARCH64_SWS_OP_CLEAR
Definition: ops_impl.h:56
SwsAArch64Context::el_size
size_t el_size
Definition: ops_asmgen.c:168
asmgen_process_return
static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:337
i_fmin
#define i_fmin(rctx, op0, op1, op2)
Definition: rasm.h:548
SwsAArch64Context::bx_start
RasmOp bx_start
Definition: ops_asmgen.c:138
AARCH64_SWS_OP_NONE
@ AARCH64_SWS_OP_NONE
Definition: ops_impl.h:39
LOOP_MASK_VH
#define LOOP_MASK_VH(s, p, idx)
Definition: ops_asmgen.c:179
a64op_vec_views
void a64op_vec_views(RasmOp op, AArch64VecViews *out)
Definition: rasm.c:330
SwsAArch64Context::bx_end
RasmOp bx_end
Definition: ops_asmgen.c:140
AARCH64_SWS_OP_READ_NIBBLE
@ AARCH64_SWS_OP_READ_NIBBLE
Definition: ops_impl.h:43
SwsAArch64Context
Definition: ops_asmgen.c:132
AArch64VecViews::s
RasmOp s
Definition: rasm.h:456
AARCH64_SWS_OP_PACK
@ AARCH64_SWS_OP_PACK
Definition: ops_impl.h:53
AARCH64_SWS_OP_SWAP_BYTES
@ AARCH64_SWS_OP_SWAP_BYTES
Definition: ops_impl.h:50
AARCH64_SWS_OP_READ_BIT
@ AARCH64_SWS_OP_READ_BIT
Definition: ops_impl.h:42
i_st2
#define i_st2(rctx, op0, op1)
Definition: rasm.h:571
SwsAArch64Context::impl
RasmOp impl
Definition: ops_asmgen.c:137
i_st3
#define i_st3(rctx, op0, op1)
Definition: rasm.h:572
i_ushr
#define i_ushr(rctx, op0, op1, op2)
Definition: rasm.h:587
RasmOp
Runtime assembler for AArch64.
Definition: rasm.h:43
AARCH64_SWS_OP_MAX
@ AARCH64_SWS_OP_MAX
Definition: ops_impl.h:60
asmgen_op_read_nibble
static void asmgen_op_read_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:423
i_addv
#define i_addv(rctx, op0, op1)
Definition: rasm.h:537
asmgen_op_clear
static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:885
swizzle_a64op
static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:676
val
static double val(void *priv, double ch)
Definition: aeval.c:77
rasm_add_label
RasmNode * rasm_add_label(RasmContext *rctx, int id)
Definition: rasm.c:146
loop
static int loop
Definition: ffplay.c:337
i_fadd
#define i_fadd(rctx, op0, op1, op2)
Definition: rasm.h:545
asmgen_op_pack
static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:819
LINEAR_MASK_1
#define LINEAR_MASK_1
Definition: ops_impl.h:129
i_ld1r
#define i_ld1r(rctx, op0, op1)
Definition: rasm.h:553
SwsAArch64Context::y_start
RasmOp y_start
Definition: ops_asmgen.c:139
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
SwsAArch64Context::exec
RasmOp exec
Definition: ops_asmgen.c:136
vv_1
static RasmOp vv_1(RasmOp op0)
Definition: rasm.h:442
a64op_elem
static RasmOp a64op_elem(RasmOp op, uint8_t idx)
Definition: rasm.h:418
FFMIN
#define FFMIN(a, b)
Definition: ops_asmgen.c:50
float
float
Definition: af_crystalizer.c:122
AArch64VecViews::be
RasmOp be[2]
Definition: rasm.h:468
asmgen_op
static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1403
AARCH64_SWS_OP_WRITE_NIBBLE
@ AARCH64_SWS_OP_WRITE_NIBBLE
Definition: ops_impl.h:47
AArch64VecViews::b
RasmOp b
Definition: rasm.h:454
AARCH64_SWS_OP_DITHER
@ AARCH64_SWS_OP_DITHER
Definition: ops_impl.h:63
AARCH64_SWS_OP_RSHIFT
@ AARCH64_SWS_OP_RSHIFT
Definition: ops_impl.h:55
s
#define s(width, name)
Definition: cbs_vp9.c:198
offsets
static const int offsets[]
Definition: hevc_pel.c:34
SwsAArch64Context::op1_impl
RasmOp op1_impl
Definition: ops_asmgen.c:153
impl_func_name
static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:113
AARCH64_SWS_OP_LINEAR
@ AARCH64_SWS_OP_LINEAR
Definition: ops_impl.h:62
LOOP
#define LOOP(mask, idx)
Definition: ops_impl.h:114
frame_size
int frame_size
Definition: mxfenc.c:2487
asmgen_op_cps
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1345
FFMAX
#define FFMAX(a, b)
Definition: ops_asmgen.c:49
SwsAArch64Context::vl
RasmOp vl[4]
Definition: ops_asmgen.c:157
offsetof_impl_cont
#define offsetof_impl_cont
Definition: ops_impl.h:175
CMT
#define CMT(comment)
Definition: ops_asmgen.c:183
linear_pass
static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vt, RasmOp *vc, int save_mask, bool vh_pass)
Performs one pass of the linear transform over a single vector bank (low or high).
Definition: ops_asmgen.c:1094
ops_entries.c
AARCH64_SWS_OP_CONVERT
@ AARCH64_SWS_OP_CONVERT
Definition: ops_impl.h:57
limits.h
i_ins
#define i_ins(rctx, op0, op1)
Definition: rasm.h:551
asmgen_op_convert
static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:908
field
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this field
Definition: writing_filters.txt:78
v_8b
static RasmOp v_8b(RasmOp op)
Definition: rasm.h:433
i_ldr
#define i_ldr(rctx, op0, op1)
Definition: rasm.h:558
i_beq
#define i_beq(rctx, id)
Definition: rasm.h:595
AARCH64_SWS_OP_PROCESS
@ AARCH64_SWS_OP_PROCESS
Definition: ops_impl.h:40
a64op_make_vec
static RasmOp a64op_make_vec(uint8_t n, uint8_t el_count, uint8_t el_size)
Definition: rasm.h:362
op_fields
static const ParamField * op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS]
Definition: ops_impl.c:327
AARCH64_PIXEL_F32
@ AARCH64_PIXEL_F32
Definition: ops_impl.h:33
asmgen_op_scale
static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1066
asmgen
static int asmgen(void)
Definition: ops_asmgen.c:1527
fields
the definition of that something depends on the semantic of the filter The callback must examine the status of the filter s links and proceed accordingly The status of output links is stored in the status_in and status_out fields and tested by the then the processing requires a frame on this link and the filter is expected to make efforts in that direction The status of input links is stored by the fifo and status_out fields
Definition: filter_design.txt:155
aarch64_op_impl_func_name
void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:125
aarch64_op_impl_lookup_str
static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params, const SwsAArch64OpImplParams *prev, const char *p_str)
Definition: ops_asmgen.c:1419
i_br
#define i_br(rctx, op0)
Definition: rasm.h:541
i_cmp
#define i_cmp(rctx, op0, op1)
Definition: rasm.h:542
AARCH64_SWS_OP_SCALE
@ AARCH64_SWS_OP_SCALE
Definition: ops_impl.h:61
asmgen_op_read_packed_1
static void asmgen_op_read_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:449
offsetof_exec_out_bump
#define offsetof_exec_out_bump
Definition: ops_impl.h:174
NULL
#define NULL
Definition: coverity.c:32
aarch64_pixel_size
static size_t aarch64_pixel_size(SwsAArch64PixelType fmt)
Definition: ops_asmgen.c:99
impl_params
static const SwsAArch64OpImplParams impl_params[]
Implementation parameters for all exported functions.
Definition: ops_asmgen.c:93
asmgen_op_unpack
static void asmgen_op_unpack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:747
i_fmul
#define i_fmul(rctx, op0, op1, op2)
Definition: rasm.h:550
asmgen_op_write_packed_n
static void asmgen_op_write_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:590
a64op_post
static RasmOp a64op_post(RasmOp op, int16_t imm)
Definition: rasm.h:498
AARCH64_SWS_OP_READ_PACKED
@ AARCH64_SWS_OP_READ_PACKED
Definition: ops_impl.h:44
i_umin
#define i_umin(rctx, op0, op1, op2)
Definition: rasm.h:582
MAX_SAVED_REGS
#define MAX_SAVED_REGS
Definition: ops_asmgen.c:263
LOOP_VH
#define LOOP_VH(s, mask, idx)
Definition: ops_asmgen.c:178
offsetof_exec_out
#define offsetof_exec_out
Definition: ops_impl.h:172
sizeof_impl
#define sizeof_impl
Definition: ops_impl.h:177
i_add
#define i_add(rctx, op0, op1, op2)
Definition: rasm.h:536
asmgen_op_read_planar
static void asmgen_op_read_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:488
rasm.c
rasm_new_label
int rasm_new_label(RasmContext *rctx, const char *name)
Allocate a new label ID with the given name.
Definition: rasm.c:282
SwsAArch64Context::tmp0
RasmOp tmp0
Definition: ops_asmgen.c:148
a64op_sp
static RasmOp a64op_sp(void)
Definition: rasm.h:353
SwsAArch64Context::vt
RasmOp vt[12]
Definition: ops_asmgen.c:159
AARCH64_SWS_OP_WRITE_PLANAR
@ AARCH64_SWS_OP_WRITE_PLANAR
Definition: ops_impl.h:49
i_uxtl
#define i_uxtl(rctx, op0, op1)
Definition: rasm.h:588
SwsAArch64Context::use_vh
bool use_vh
Definition: ops_asmgen.c:171
LOOP_MASK
#define LOOP_MASK(p, idx)
Definition: ops_impl.h:121
AARCH64_SWS_OP_LSHIFT
@ AARCH64_SWS_OP_LSHIFT
Definition: ops_impl.h:54
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
i_ldrb
#define i_ldrb(rctx, op0, op1)
Definition: rasm.h:559
i_shl
#define i_shl(rctx, op0, op1, op2)
Definition: rasm.h:569
i_fmax
#define i_fmax(rctx, op0, op1, op2)
Definition: rasm.h:547
size
int size
Definition: twinvq_data.h:10344
AArch64VecViews::h
RasmOp h
Definition: rasm.h:455
i_zip2
#define i_zip2(rctx, op0, op1, op2)
Definition: rasm.h:592
SwsAArch64OpImplParams::op
SwsAArch64OpType op
Definition: ops_impl.h:95
SwsAArch64Context::vec_size
size_t vec_size
Definition: ops_asmgen.c:170
i_fcvtzu
#define i_fcvtzu(rctx, op0, op1)
Definition: rasm.h:546
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:166
asmgen_op_read_packed_n
static void asmgen_op_read_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:466
i_ucvtf
#define i_ucvtf(rctx, op0, op1)
Definition: rasm.h:580
AARCH64_SWS_OP_WRITE_BIT
@ AARCH64_SWS_OP_WRITE_BIT
Definition: ops_impl.h:46
a64op_off
static RasmOp a64op_off(RasmOp op, int16_t imm)
Definition: rasm.h:496
AARCH64_SWS_OP_READ_PLANAR
@ AARCH64_SWS_OP_READ_PLANAR
Definition: ops_impl.h:45
av_freep
static void av_freep(void *ptr)
Definition: ops_asmgen.c:52
AARCH64_SWS_OP_EXPAND
@ AARCH64_SWS_OP_EXPAND
Definition: ops_impl.h:58
i_uxtl2
#define i_uxtl2(rctx, op0, op1)
Definition: rasm.h:589
AArch64VecViews::de
RasmOp de[2]
Definition: rasm.h:469
SwsAArch64Context::tmp1
RasmOp tmp1
Definition: ops_asmgen.c:149
AARCH64_SWS_OP_UNPACK
@ AARCH64_SWS_OP_UNPACK
Definition: ops_impl.h:52
vv_4
static RasmOp vv_4(RasmOp op0, RasmOp op1, RasmOp op2, RasmOp op3)
Definition: rasm.h:445
i_lsr
#define i_lsr(rctx, op0, op1, op2)
Definition: rasm.h:561
rasm_annotate_next
void rasm_annotate_next(RasmContext *rctx, const char *comment)
Definition: rasm.c:263
clobbered_frame_size
static unsigned clobbered_frame_size(unsigned n)
Definition: ops_asmgen.c:214
RasmContext
Definition: rasm.h:184
lookup
int lookup
Definition: vorbis_enc_data.h:428
i_ldp
#define i_ldp(rctx, op0, op1, op2)
Definition: rasm.h:557
delta
float delta
Definition: vorbis_enc_data.h:430
asmgen_op_swap_bytes
static void asmgen_op_swap_bytes(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:637
AArch64VecViews::q
RasmOp q
Definition: rasm.h:458
offsetof_impl_priv
#define offsetof_impl_priv
Definition: ops_impl.h:176
asmgen_op_write_packed
static void asmgen_op_write_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:601
AArch64VecViews::h4
RasmOp h4
Definition: rasm.h:462
i_xtn
#define i_xtn(rctx, op0, op1)
Definition: rasm.h:590
SwsAArch64Context::in
RasmOp in[4]
Definition: ops_asmgen.c:162
av_assert0
#define av_assert0(cond)
Definition: ops_asmgen.c:43
a64op_pre
static RasmOp a64op_pre(RasmOp op, int16_t imm)
Definition: rasm.h:497
SWIZZLE_TMP
#define SWIZZLE_TMP
Definition: ops_asmgen.c:664
asmgen_op_lshift
static void asmgen_op_lshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:857
clobbered_gprs
static unsigned clobbered_gprs(const SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp regs[MAX_SAVED_REGS])
Definition: ops_asmgen.c:273
SwsAArch64Context::el_count
size_t el_count
Definition: ops_asmgen.c:169
rasm_annotate_nextf
void rasm_annotate_nextf(RasmContext *rctx, char *s, size_t n, const char *fmt,...)
Definition: rasm.c:273
i_umax
#define i_umax(rctx, op0, op1, op2)
Definition: rasm.h:581
ret
ret
Definition: filter_design.txt:187
asmgen_process
static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:287
offsetof_exec_in
#define offsetof_exec_in
These values will be used by ops_asmgen to access fields inside of SwsOpExec and SwsOpImpl.
Definition: ops_impl.h:171
MASK_SET
#define MASK_SET(mask, idx, val)
Definition: ops_impl.h:112
rasm_func_begin
int rasm_func_begin(RasmContext *rctx, const char *name, bool export, bool jumpable)
Definition: rasm.c:209
i_mov16b
#define i_mov16b(rctx, op0, op1)
Definition: rasm.h:613
AARCH64_PIXEL_U8
@ AARCH64_PIXEL_U8
Definition: ops_impl.h:30
SwsAArch64Context::cont
RasmOp cont
Definition: ops_asmgen.c:154
lookup_gen
static int lookup_gen(void)
Definition: ops_asmgen.c:1485
SwsAArch64Context::in_bump
RasmOp in_bump[4]
Definition: ops_asmgen.c:164
AARCH64_PIXEL_U32
@ AARCH64_PIXEL_U32
Definition: ops_impl.h:32
MASK_GET
#define MASK_GET(mask, idx)
Definition: ops_impl.h:111
i_str
#define i_str(rctx, op0, op1)
Definition: rasm.h:575
dynarray.h
swizzle_emit
static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
Definition: ops_asmgen.c:683
i_and
#define i_and(rctx, op0, op1, op2)
Definition: rasm.h:539
i_ldrh
#define i_ldrh(rctx, op0, op1)
Definition: rasm.h:560
asmgen_op_write_packed_1
static void asmgen_op_write_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:573
SwsAArch64Context::op0_func
RasmOp op0_func
Definition: ops_asmgen.c:152
asmgen_prologue
static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:219
asmgen_op_expand
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:986
i_ubfiz
#define i_ubfiz(rctx, op0, op1, op2, op3)
Definition: rasm.h:579
asmgen_epilogue
static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:240
Windows::Graphics::DirectX::Direct3D11::p
IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p
Definition: vsrc_gfxcapture_winrt.hpp:53
i_orr
#define i_orr(rctx, op0, op1, op2)
Definition: rasm.h:565
AARCH64_SWS_OP_WRITE_PACKED
@ AARCH64_SWS_OP_WRITE_PACKED
Definition: ops_impl.h:48
SwsAArch64OpImplParams
SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType operation.
Definition: ops_impl.h:94
i_rev32
#define i_rev32(rctx, op0, op1)
Definition: rasm.h:568
i_stp
#define i_stp(rctx, op0, op1, op2)
Definition: rasm.h:574
asmgen_op_min
static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1018
AARCH64_SWS_OP_PROCESS_RETURN
@ AARCH64_SWS_OP_PROCESS_RETURN
Definition: ops_impl.h:41
i_movi
#define i_movi(rctx, op0, op1)
Definition: rasm.h:563
i_ret
#define i_ret(rctx)
Definition: rasm.h:566
AARCH64_PIXEL_U16
@ AARCH64_PIXEL_U16
Definition: ops_impl.h:31
a64op_x
static RasmOp a64op_x(RasmOp op)
Definition: rasm.h:357
asmgen_op_linear
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1180
offsetof_exec_in_bump
#define offsetof_exec_in_bump
Definition: ops_impl.h:173
SwsAArch64Context::bx
RasmOp bx
Definition: ops_asmgen.c:144
asmgen_op_rshift
static void asmgen_op_rshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:871
linear_index_is_offset
static int linear_index_is_offset(int idx)
Definition: ops_impl.h:152
linear_num_vregs
static int linear_num_vregs(const SwsAArch64OpImplParams *params)
Definition: ops_impl.h:138
SwsAArch64PixelType
SwsAArch64PixelType
Definition: ops_impl.h:29
print_swizzle_v
static const char * print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:666
width
#define width
Definition: dsp.h:89
asmgen_op_swizzle
static void asmgen_op_swizzle(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:695
SwsAArch64Context::y_end
RasmOp y_end
Definition: ops_asmgen.c:141
snprintf
#define snprintf
Definition: snprintf.h:34
i_ushl
#define i_ushl(rctx, op0, op1, op2)
Definition: rasm.h:584
rasm_add_comment
RasmNode * rasm_add_comment(RasmContext *rctx, const char *comment)
Definition: rasm.c:117
src
#define src
Definition: vp8dsp.c:248
a64op_gpr_n
static uint8_t a64op_gpr_n(RasmOp op)
Definition: rasm.h:348
i_mov
#define i_mov(rctx, op0, op1)
Definition: rasm.h:562
v_q
static RasmOp v_q(RasmOp op)
Definition: rasm.h:430
AArch64VecViews::b8
RasmOp b8
Definition: rasm.h:460