|
FFmpeg
|
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libavutil/dynarray.h"
#include "rasm.c"
#include "rasm_print.c"
#include "ops_impl.c"
#include "ops_entries.c"
Go to the source code of this file.
Data Structures | |
| struct | SwsAArch64Context |
Macros | |
| #define | AVUTIL_AVASSERT_H |
| This file is compiled as a standalone build-time tool and must not depend on internal FFmpeg libraries. More... | |
| #define | AVUTIL_LOG_H |
| #define | AVUTIL_MACROS_H |
| #define | AVUTIL_MEM_H |
| #define | av_assert0(cond) assert(cond) |
| #define | av_malloc(s) malloc(s) |
| #define | av_mallocz(s) calloc(1, s) |
| #define | av_realloc(p, s) realloc(p, s) |
| #define | av_strdup(s) strdup(s) |
| #define | av_free(p) free(p) |
| #define | FFMAX(a, b) ((a) > (b) ? (a) : (b)) |
| #define | FFMIN(a, b) ((a) > (b) ? (b) : (a)) |
| #define | LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx) |
| #define | LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx) |
| #define | LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx) |
| #define | CMT(comment) rasm_annotate(r, comment) |
| #define | CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__) |
| #define | MAX_SAVED_REGS 10 |
| #define | SWIZZLE_TMP 0xf |
| #define | PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh) |
Variables | |
| static const SwsAArch64OpImplParams | impl_params [] |
| Implementation parameters for all exported functions. More... | |
| #define AVUTIL_AVASSERT_H |
This file is compiled as a standalone build-time tool and must not depend on internal FFmpeg libraries.
The necessary utils are redefined below using standard C equivalents.
Definition at line 39 of file ops_asmgen.c.
| #define AVUTIL_LOG_H |
Definition at line 40 of file ops_asmgen.c.
| #define AVUTIL_MACROS_H |
Definition at line 41 of file ops_asmgen.c.
| #define AVUTIL_MEM_H |
Definition at line 42 of file ops_asmgen.c.
Definition at line 43 of file ops_asmgen.c.
Definition at line 44 of file ops_asmgen.c.
Definition at line 45 of file ops_asmgen.c.
Definition at line 46 of file ops_asmgen.c.
Definition at line 47 of file ops_asmgen.c.
| #define av_free | ( | p | ) | free(p) |
Definition at line 48 of file ops_asmgen.c.
Definition at line 178 of file ops_asmgen.c.
Definition at line 179 of file ops_asmgen.c.
| #define LOOP_MASK_BWD_VH | ( | s, | |
| p, | |||
| idx | |||
| ) | if (s->use_vh) LOOP_MASK_BWD(p, idx) |
Definition at line 180 of file ops_asmgen.c.
| #define CMT | ( | comment | ) | rasm_annotate(r, comment) |
Definition at line 183 of file ops_asmgen.c.
| #define CMTF | ( | fmt, | |
| ... | |||
| ) | rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__) |
Definition at line 184 of file ops_asmgen.c.
| #define MAX_SAVED_REGS 10 |
Definition at line 263 of file ops_asmgen.c.
| #define SWIZZLE_TMP 0xf |
Definition at line 664 of file ops_asmgen.c.
| #define PRINT_SWIZZLE_V | ( | n, | |
| vh | |||
| ) | print_swizzle_v((char[8]){ 0 }, n, vh) |
Definition at line 674 of file ops_asmgen.c.
|
static |
Definition at line 52 of file ops_asmgen.c.
Referenced by av_dynarray2_add().
|
static |
Definition at line 65 of file ops_asmgen.c.
|
static |
Definition at line 99 of file ops_asmgen.c.
Referenced by asmgen_op_convert(), asmgen_op_cps(), asmgen_op_expand(), and asmgen_op_swap_bytes().
|
static |
Definition at line 113 of file ops_asmgen.c.
Referenced by aarch64_op_impl_func_name(), and aarch64_op_impl_lookup_str().
| void aarch64_op_impl_func_name | ( | char * | buf, |
| size_t | size, | ||
| const SwsAArch64OpImplParams * | params | ||
| ) |
Definition at line 125 of file ops_asmgen.c.
Referenced by asmgen_op_cps(), asmgen_process(), asmgen_process_return(), and lookup_gen().
|
static |
Definition at line 187 of file ops_asmgen.c.
Referenced by asmgen_op_cps(), asmgen_op_expand(), asmgen_op_pack(), and asmgen_op_unpack().
|
static |
Definition at line 214 of file ops_asmgen.c.
Referenced by asmgen_epilogue(), and asmgen_prologue().
|
static |
Definition at line 219 of file ops_asmgen.c.
Referenced by asmgen_process().
|
static |
Definition at line 240 of file ops_asmgen.c.
Referenced by asmgen_process_return().
Definition at line 265 of file ops_asmgen.c.
Referenced by clobbered_gprs().
|
static |
Definition at line 273 of file ops_asmgen.c.
Referenced by asmgen_process(), and asmgen_process_return().
|
static |
The process/process_return functions for aarch64 work similarly to the x86 backend. The description in x86/ops_common.asm mostly holds as well here.
Definition at line 287 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Definition at line 337 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Definition at line 387 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 423 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 449 of file ops_asmgen.c.
Referenced by asmgen_op_read_packed().
|
static |
Definition at line 466 of file ops_asmgen.c.
Referenced by asmgen_op_read_packed().
|
static |
Definition at line 477 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 488 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 516 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 546 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 573 of file ops_asmgen.c.
Referenced by asmgen_op_write_packed().
|
static |
Definition at line 590 of file ops_asmgen.c.
Referenced by asmgen_op_write_packed().
|
static |
Definition at line 601 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 612 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 637 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 666 of file ops_asmgen.c.
|
static |
Definition at line 676 of file ops_asmgen.c.
Referenced by swizzle_emit().
|
static |
Definition at line 683 of file ops_asmgen.c.
Referenced by asmgen_op_swizzle().
|
static |
Definition at line 695 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
All-one values in movi only work up to 8-bit, and then at full 16- or 32-bit, but not for intermediate values like 10-bit. In those cases, we use mov + dup instead.
Definition at line 747 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 819 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 857 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 871 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
TODO
Definition at line 885 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Since each instruction in the convert operation needs specific element types, it is simpler to use arrangement specifiers for each operand instead of reshaping all vectors.
This function assumes block_size is either 8 or 16, and that we're always using the most amount of vector registers possible. Therefore, u32 always uses the high vector bank.
Definition at line 908 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 986 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1018 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1042 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1066 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Performs one pass of the linear transform over a single vector bank (low or high).
The intermediate registers for fmul+fadd (for when SWS_BITEXACT is set) start from temp vector 4.
Save rows that need to be used as input after they have been already written to.
The non-zero coefficients have been packed in aarch64_setup_linear() in sequential order into the individual lanes of the coefficient vector registers. We must follow the same order of execution here.
Split the multiply-accumulate into fmul+fadd. All multiplications are performed first into temporary registers, and only then added to the destination, to reduce the dependency chain. There is no need to perform multiplications by 1.
Most modern aarch64 cores have a fastpath for sequences of fmla instructions. This means that even if the coefficient is 1, it is still faster to use fmla by 1 instead of fadd.
Definition at line 1094 of file ops_asmgen.c.
Referenced by asmgen_op_linear().
|
static |
Definition at line 1180 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
For a description of the matrix buffer layout, read the comments in aarch64_setup_dither() in aarch64/ops.c.
Sort components by y_offset value so that we can start dithering with the smallest value, and increment the pointer upwards for each new offset. The dither matrix is over-allocated and may be over-read at the top, but it cannot be over-read before the start of the buffer. Since we only mask the y offset once, this would be an issue if we tried to subtract a value larger than the initial y_offset.
We use ubfiz to mask and shift left in one single instruction: ubfiz <Wd>, <Wn>, #<lsb>, #<width> — which is equivalent to: Wd = (Wn & ((1 << width) - 1)) << lsb;
Given: block_size = 8, log2(block_size) = 3 dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111 sizeof(float) = 4, log2(sizeof(float)) = 2
Suppose we have bx = 0bvvvv. To get x, we left shift by log2(block_size) and end up with 0bvvvv000. Then we mask against dither_mask, and end up with 0bv000. Finally we multiply by sizeof(float), which is the same as shifting left by log2(sizeof(float)). The result is 0bv00000.
Therefore: width = log2(dither_size) - log2(block_size) lsb = log2(block_size) + log2(sizeof(float))
The ubfiz instruction for the y offset performs masking by the dither matrix size and shifts by the stride.
On subsequent runs, just increment the pointer. The matrix is over-allocated, so we don't risk overreading.
Definition at line 1224 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Set up vector register dimensions and reshape all vectors accordingly.
Definition at line 1345 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Definition at line 1403 of file ops_asmgen.c.
Referenced by asmgen().
|
static |
Definition at line 1419 of file ops_asmgen.c.
Referenced by lookup_gen().
|
static |
The lookup function matches the SwsAArch64OpImplParams from ops_entries.c to the exported functions generated by asmgen_op(). Each call to aarch64_op_impl_lookup_str() generates a code fragment to uniquely detect the current function, opening and/or closing conditions depending on the parameters of the previous function.
Definition at line 1485 of file ops_asmgen.c.
Referenced by main().
|
static |
The entry point of the SwsOpFunc is the process function. The kernel functions are chained by directly branching to the next operation, using a continuation-passing style design. The exit point of the SwsOpFunc is the process_return function.
The GPRs used by the entire call-chain are listed below.
Function arguments are passed in r0-r5. After the parameters from exec have been read, r0 is reused to branch to the continuation functions. After the original parameters from impl have been computed, r1 is reused as the impl pointer for each operation.
Loop iterators are r6 for bx and r3 for y, reused from y_start, which doesn't need to be preserved.
The intra-procedure-call temporary registers (r16 and r17) are used as scratch registers. They may be used by call veneers and PLT code inserted by the linker, so we cannot expect them to persist across branches between functions.
The Platform Register (r18) is not used.
The read/write data pointers and padding values first use up the remaining free caller-saved registers, and only then are the caller-saved registers (r19-r28) used.
Definition at line 1527 of file ops_asmgen.c.
Referenced by main().
| int main | ( | int | argc, |
| char * | argv[] | ||
| ) |
Definition at line 1625 of file ops_asmgen.c.
|
static |
Implementation parameters for all exported functions.
This list is compiled by performing a dummy run of all conversions in sws_ops and collecting all functions that need to be generated. This is achieved by running: make sws_ops_entries_aarch64
Definition at line 93 of file ops_asmgen.c.
Referenced by asmgen(), and lookup_gen().
1.8.17