|
FFmpeg
|
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libavutil/dynarray.h"
#include "rasm.c"
#include "rasm_print.c"
#include "ops_impl.c"
#include "ops_entries.c"
Go to the source code of this file.
Data Structures | |
| struct | SwsAArch64Context |
Macros | |
| #define | AVUTIL_AVASSERT_H |
| This file is compiled as a standalone build-time tool and must not depend on internal FFmpeg libraries. More... | |
| #define | AVUTIL_LOG_H |
| #define | AVUTIL_MACROS_H |
| #define | AVUTIL_MEM_H |
| #define | av_assert0(cond) assert(cond) |
| #define | av_malloc(s) malloc(s) |
| #define | av_mallocz(s) calloc(1, s) |
| #define | av_realloc(p, s) realloc(p, s) |
| #define | av_strdup(s) strdup(s) |
| #define | av_free(p) free(p) |
| #define | FFMAX(a, b) ((a) > (b) ? (a) : (b)) |
| #define | FFMIN(a, b) ((a) > (b) ? (b) : (a)) |
| #define | LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx) |
| #define | LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx) |
| #define | LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx) |
| #define | CMT(comment) rasm_annotate(r, comment) |
| #define | CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__) |
| #define | MAX_SAVED_REGS 10 |
| #define | SWIZZLE_TMP 0xf |
| #define | PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh) |
Variables | |
| static const SwsAArch64OpImplParams | impl_params [] |
| Implementation parameters for all exported functions. More... | |
| #define AVUTIL_AVASSERT_H |
This file is compiled as a standalone build-time tool and must not depend on internal FFmpeg libraries.
The necessary utils are redefined below using standard C equivalents.
Definition at line 39 of file ops_asmgen.c.
| #define AVUTIL_LOG_H |
Definition at line 40 of file ops_asmgen.c.
| #define AVUTIL_MACROS_H |
Definition at line 41 of file ops_asmgen.c.
| #define AVUTIL_MEM_H |
Definition at line 42 of file ops_asmgen.c.
Definition at line 43 of file ops_asmgen.c.
Definition at line 44 of file ops_asmgen.c.
Definition at line 45 of file ops_asmgen.c.
Definition at line 46 of file ops_asmgen.c.
Definition at line 47 of file ops_asmgen.c.
| #define av_free | ( | p | ) | free(p) |
Definition at line 48 of file ops_asmgen.c.
Definition at line 179 of file ops_asmgen.c.
Definition at line 180 of file ops_asmgen.c.
| #define LOOP_MASK_BWD_VH | ( | s, | |
| p, | |||
| idx | |||
| ) | if (s->use_vh) LOOP_MASK_BWD(p, idx) |
Definition at line 181 of file ops_asmgen.c.
| #define CMT | ( | comment | ) | rasm_annotate(r, comment) |
Definition at line 184 of file ops_asmgen.c.
| #define CMTF | ( | fmt, | |
| ... | |||
| ) | rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__) |
Definition at line 185 of file ops_asmgen.c.
| #define MAX_SAVED_REGS 10 |
Definition at line 264 of file ops_asmgen.c.
| #define SWIZZLE_TMP 0xf |
Definition at line 679 of file ops_asmgen.c.
| #define PRINT_SWIZZLE_V | ( | n, | |
| vh | |||
| ) | print_swizzle_v((char[8]){ 0 }, n, vh) |
Definition at line 689 of file ops_asmgen.c.
|
static |
Definition at line 52 of file ops_asmgen.c.
Referenced by av_dynarray2_add().
|
static |
Definition at line 65 of file ops_asmgen.c.
|
static |
Definition at line 99 of file ops_asmgen.c.
Referenced by asmgen_op_convert(), asmgen_op_cps(), asmgen_op_expand(), and asmgen_op_swap_bytes().
|
static |
Definition at line 113 of file ops_asmgen.c.
Referenced by aarch64_op_impl_func_name(), and aarch64_op_impl_lookup_str().
| void aarch64_op_impl_func_name | ( | char * | buf, |
| size_t | size, | ||
| const SwsAArch64OpImplParams * | params | ||
| ) |
Definition at line 125 of file ops_asmgen.c.
Referenced by asmgen_op_cps(), asmgen_process(), asmgen_process_return(), and lookup_gen().
|
static |
Definition at line 188 of file ops_asmgen.c.
Referenced by asmgen_op_cps(), asmgen_op_expand(), asmgen_op_pack(), and asmgen_op_unpack().
|
static |
Definition at line 215 of file ops_asmgen.c.
Referenced by asmgen_epilogue(), and asmgen_prologue().
|
static |
Definition at line 220 of file ops_asmgen.c.
Referenced by asmgen_process().
|
static |
Definition at line 241 of file ops_asmgen.c.
Referenced by asmgen_process_return().
Definition at line 266 of file ops_asmgen.c.
Referenced by clobbered_gprs().
|
static |
Definition at line 274 of file ops_asmgen.c.
Referenced by asmgen_process(), and asmgen_process_return().
|
static |
The process/process_return functions for aarch64 work similarly to those of the x86 backend. The description in x86/ops_common.asm mostly holds here as well.
Definition at line 288 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Definition at line 338 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Set node where the continuation address will be loaded and impl will be incremented.
This should be done right after impl->priv has been used.
Definition at line 387 of file ops_asmgen.c.
Referenced by asmgen_op_clear(), asmgen_op_cps(), asmgen_op_dither(), asmgen_op_linear(), asmgen_op_max(), asmgen_op_min(), asmgen_op_read_bit(), asmgen_op_scale(), and asmgen_op_write_bit().
|
static |
Definition at line 400 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 437 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 463 of file ops_asmgen.c.
Referenced by asmgen_op_read_packed().
|
static |
Definition at line 480 of file ops_asmgen.c.
Referenced by asmgen_op_read_packed().
|
static |
Definition at line 491 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 502 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 530 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 561 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 588 of file ops_asmgen.c.
Referenced by asmgen_op_write_packed().
|
static |
Definition at line 605 of file ops_asmgen.c.
Referenced by asmgen_op_write_packed().
|
static |
Definition at line 616 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 627 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 652 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 681 of file ops_asmgen.c.
|
static |
Definition at line 691 of file ops_asmgen.c.
Referenced by swizzle_emit().
|
static |
Definition at line 698 of file ops_asmgen.c.
Referenced by asmgen_op_swizzle().
|
static |
Definition at line 710 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
All-one values in movi only work up to 8-bit, and then at full 16- or 32-bit, but not for intermediate values like 10-bit. In those cases, we use mov + dup instead.
Definition at line 762 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 834 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 872 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 886 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
TODO
Definition at line 900 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Since each instruction in the convert operation needs specific element types, it is simpler to use arrangement specifiers for each operand instead of reshaping all vectors.
This function assumes block_size is either 8 or 16, and that we're always using the largest possible number of vector registers. Therefore, u32 always uses the high vector bank.
Definition at line 924 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1002 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1034 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1059 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Definition at line 1084 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Performs one pass of the linear transform over a single vector bank (low or high).
The intermediate registers for fmul+fadd (for when SWS_BITEXACT is set) start from temp vector 4.
Save rows that need to be used as input after they have already been written to.
The non-zero coefficients have been packed in aarch64_setup_linear() in sequential order into the individual lanes of the coefficient vector registers. We must follow the same order of execution here.
Split the multiply-accumulate into fmul+fadd. All multiplications are performed first into temporary registers, and only then added to the destination, to reduce the dependency chain. There is no need to perform multiplications by 1.
Most modern aarch64 cores have a fastpath for sequences of fmla instructions. This means that even if the coefficient is 1, it is still faster to use fmla by 1 instead of fadd.
Definition at line 1113 of file ops_asmgen.c.
Referenced by asmgen_op_linear().
|
static |
Definition at line 1199 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
For a description of the matrix buffer layout, read the comments in aarch64_setup_dither() in aarch64/ops.c.
Sort components by y_offset value so that we can start dithering with the smallest value, and increment the pointer upwards for each new offset. The dither matrix is over-allocated and may be over-read at the top, but it cannot be over-read before the start of the buffer. Since we only mask the y offset once, this would be an issue if we tried to subtract a value larger than the initial y_offset.
We use ubfiz to mask and shift left in one single instruction. The syntax is "ubfiz <Wd>, <Wn>, #<lsb>, #<width>", and it computes Wd = (Wn & ((1 << width) - 1)) << lsb.
Given: block_size = 8, log2(block_size) = 3; dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111; sizeof(float) = 4, log2(sizeof(float)) = 2.
Suppose we have bx = 0bvvvv. To get x, we left shift by log2(block_size) and end up with 0bvvvv000. Then we mask against dither_mask, and end up with 0bv000. Finally we multiply by sizeof(float), which is the same as shifting left by log2(sizeof(float)). The result is 0bv00000.
Therefore: width = log2(dither_size) - log2(block_size), and lsb = log2(block_size) + log2(sizeof(float)).
The ubfiz instruction for the y offset performs masking by the dither matrix size and shifts by the stride.
On subsequent runs, just increment the pointer. The matrix is over-allocated, so we don't risk overreading.
Definition at line 1244 of file ops_asmgen.c.
Referenced by asmgen_op_cps().
|
static |
Set up vector register dimensions and reshape all vectors accordingly.
Definition at line 1366 of file ops_asmgen.c.
Referenced by asmgen_op().
|
static |
Definition at line 1429 of file ops_asmgen.c.
Referenced by asmgen().
|
static |
Definition at line 1445 of file ops_asmgen.c.
Referenced by lookup_gen().
|
static |
The lookup function matches the SwsAArch64OpImplParams from ops_entries.c to the exported functions generated by asmgen_op(). Each call to aarch64_op_impl_lookup_str() generates a code fragment to uniquely detect the current function, opening and/or closing conditions depending on the parameters of the previous function.
Definition at line 1511 of file ops_asmgen.c.
Referenced by main().
|
static |
The entry point of the SwsOpFunc is the process function. The kernel functions are chained by directly branching to the next operation, using a continuation-passing style design. The exit point of the SwsOpFunc is the process_return function.
The GPRs used by the entire call-chain are listed below.
Function arguments are passed in r0-r5. After the parameters from exec have been read, r0 is reused to branch to the continuation functions. After the original parameters from impl have been computed, r1 is reused as the impl pointer for each operation.
Loop iterators are r6 for bx and r3 for y, reused from y_start, which doesn't need to be preserved.
The intra-procedure-call temporary registers (r16 and r17) are used as scratch registers. They may be used by call veneers and PLT code inserted by the linker, so we cannot expect them to persist across branches between functions.
The Platform Register (r18) is not used.
The read/write data pointers and padding values first use up the remaining free caller-saved registers, and only then are the callee-saved registers (r19-r28) used.
Definition at line 1553 of file ops_asmgen.c.
Referenced by main().
| int main | ( | int | argc, |
| char * | argv[] | ||
| ) |
Definition at line 1651 of file ops_asmgen.c.
|
static |
Implementation parameters for all exported functions.
This list is compiled by performing a dummy run of all conversions in sws_ops and collecting all functions that need to be generated. This is achieved by running: make sws_ops_entries_aarch64
Definition at line 93 of file ops_asmgen.c.
Referenced by asmgen(), and lookup_gen().
1.8.17