FFmpeg
rnd_template.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8  * and improved by Zdenek Kabelac <kabi@users.sf.net>
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 #include "inline_asm.h"
31 
32 // put_pixels
/**
 * Half-pel (x+1/2, y+1/2) interpolation of an 8-pixel-wide block, "put" variant.
 *
 * For each of the h output rows, every one of the 8 output bytes is
 *     (p[x] + p[x+1] + p[x+line_size] + p[x+1+line_size] + rnd) >> 2
 * where p points at the matching source row and rnd is the word constant that
 * SET_RND() places in mm6 (2 for the rnd build, 1 for the no_rnd build).
 * The result overwrites the destination.
 *
 * The function name is generated by the DEF() macro, which is defined by the
 * file that includes this template.
 *
 * @param block     destination; 8 bytes are written per row, h rows
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row, and
 *                  h + 1 rows are read in total
 * @param line_size byte stride, applied to both block and pixels
 * @param h         row count; two rows are produced per loop iteration and the
 *                  loop only terminates when the counter reaches exactly zero,
 *                  so h is expected to be even and non-zero
 *
 * Operand map for the asm block: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size. FF_REG_a is clobbered and used as the running byte offset.
 *
 * NOTE(review): MOVQ_ZERO() and SET_RND() are separate statements defined
 * outside this file; this code relies on mm7/mm6 still holding their values
 * when the asm block below executes. No emms is executed here, so resetting
 * the MMX/x87 state is presumably left to the caller -- confirm.
 */
33 av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
34  ptrdiff_t line_size, int h)
35 {
36  MOVQ_ZERO(mm7); // mm7 is the zero operand of every punpck*bw below (byte -> word widening)
37  SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
38  __asm__ volatile(
    /* Prologue: horizontal pair sums of source row 0, kept as words in mm4 (low 4) / mm5 (high 4). */
39  "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
40  "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
41  "movq %%mm0, %%mm1 \n\t"
42  "movq %%mm4, %%mm5 \n\t"
43  "punpcklbw %%mm7, %%mm0 \n\t"
44  "punpcklbw %%mm7, %%mm4 \n\t"
45  "punpckhbw %%mm7, %%mm1 \n\t"
46  "punpckhbw %%mm7, %%mm5 \n\t"
47  "paddusw %%mm0, %%mm4 \n\t"
48  "paddusw %%mm1, %%mm5 \n\t"
49  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // byte offset = 0
50  "add %3, %1 \n\t" // pixels now points at row 1
51  ".p2align 3 \n\t"
52  "1: \n\t"
    /* First half: mm0/mm1 = pair sums of the next source row; mm4/mm5 still hold the previous row. */
53  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
54  "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
55  "movq %%mm0, %%mm1 \n\t"
56  "movq %%mm2, %%mm3 \n\t"
57  "punpcklbw %%mm7, %%mm0 \n\t"
58  "punpcklbw %%mm7, %%mm2 \n\t"
59  "punpckhbw %%mm7, %%mm1 \n\t"
60  "punpckhbw %%mm7, %%mm3 \n\t"
61  "paddusw %%mm2, %%mm0 \n\t"
62  "paddusw %%mm3, %%mm1 \n\t"
63  "paddusw %%mm6, %%mm4 \n\t" // add rounding constant
64  "paddusw %%mm6, %%mm5 \n\t"
65  "paddusw %%mm0, %%mm4 \n\t" // previous row sums + this row sums
66  "paddusw %%mm1, %%mm5 \n\t"
67  "psrlw $2, %%mm4 \n\t" // divide the four-sample sum by 4
68  "psrlw $2, %%mm5 \n\t"
69  "packuswb %%mm5, %%mm4 \n\t" // words back to 8 bytes
70  "movq %%mm4, (%2, %%"FF_REG_a") \n\t" // store one output row
71  "add %3, %%"FF_REG_a" \n\t" // advance offset by one row
72 
    /* Second half: same steps with register roles swapped, so mm0/mm1 (just computed) act as the previous row. */
73  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
74  "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
75  "movq %%mm2, %%mm3 \n\t"
76  "movq %%mm4, %%mm5 \n\t"
77  "punpcklbw %%mm7, %%mm2 \n\t"
78  "punpcklbw %%mm7, %%mm4 \n\t"
79  "punpckhbw %%mm7, %%mm3 \n\t"
80  "punpckhbw %%mm7, %%mm5 \n\t"
81  "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of this row, carried into the next iteration
82  "paddusw %%mm3, %%mm5 \n\t"
83  "paddusw %%mm6, %%mm0 \n\t"
84  "paddusw %%mm6, %%mm1 \n\t"
85  "paddusw %%mm4, %%mm0 \n\t"
86  "paddusw %%mm5, %%mm1 \n\t"
87  "psrlw $2, %%mm0 \n\t"
88  "psrlw $2, %%mm1 \n\t"
89  "packuswb %%mm1, %%mm0 \n\t"
90  "movq %%mm0, (%2, %%"FF_REG_a") \n\t" // store the second output row
91  "add %3, %%"FF_REG_a" \n\t"
92 
93  "subl $2, %0 \n\t" // two rows done per iteration
94  "jnz 1b \n\t" // exits only when h hits exactly zero
95  :"+g"(h), "+S"(pixels)
96  :"D"(block), "r"((x86_reg)line_size)
97  :FF_REG_a, "memory");
98 }
99 
100 #ifndef NO_AVG
101 // avg_pixels
102 // this routine is 'slightly' suboptimal but mostly unused
/**
 * Half-pel (x+1/2, y+1/2) interpolation of an 8-pixel-wide block, "avg" variant.
 *
 * For each of the h output rows the 8 interpolated bytes
 *     (p[x] + p[x+1] + p[x+line_size] + p[x+1+line_size] + rnd) >> 2
 * are computed (rnd is the word constant SET_RND() places in mm6), then
 * combined with the 8 bytes already stored in the destination row through
 * PAVGB_MMX(), and the combined value is written back.
 *
 * NOTE(review): PAVGB_MMX(rega, regb, regr, regfe) is defined in inline_asm.h
 * and its body is not visible here; it presumably yields the byte-wise
 * rounded-up average of rega and regb in regr, using regfe as a 0xFE byte
 * mask -- confirm against the macro definition.
 *
 * The function name is generated by the DEF() macro, which is defined by the
 * file that includes this template.
 *
 * @param block     destination; 8 bytes per row are read and then rewritten,
 *                  h rows
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row, and
 *                  h + 1 rows are read in total
 * @param line_size byte stride, applied to both block and pixels
 * @param h         row count; two rows are produced per loop iteration and the
 *                  loop only terminates when the counter reaches exactly zero,
 *                  so h is expected to be even and non-zero
 *
 * Operand map for the asm block: %0 = h, %1 = pixels, %2 = block,
 * %3 = line_size. FF_REG_a is clobbered and used as the running byte offset.
 *
 * NOTE(review): MOVQ_ZERO() and SET_RND() are separate statements defined
 * outside this file; this code relies on mm7/mm6 still holding their values
 * when the asm block below executes. No emms is executed here, so resetting
 * the MMX/x87 state is presumably left to the caller -- confirm.
 */
103 av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
104  ptrdiff_t line_size, int h)
105 {
106  MOVQ_ZERO(mm7); // mm7 is the zero operand of every punpck*bw below (byte -> word widening)
107  SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
108  __asm__ volatile(
    /* Prologue: horizontal pair sums of source row 0, kept as words in mm4 (low 4) / mm5 (high 4). */
109  "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
110  "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
111  "movq %%mm0, %%mm1 \n\t"
112  "movq %%mm4, %%mm5 \n\t"
113  "punpcklbw %%mm7, %%mm0 \n\t"
114  "punpcklbw %%mm7, %%mm4 \n\t"
115  "punpckhbw %%mm7, %%mm1 \n\t"
116  "punpckhbw %%mm7, %%mm5 \n\t"
117  "paddusw %%mm0, %%mm4 \n\t"
118  "paddusw %%mm1, %%mm5 \n\t"
119  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // byte offset = 0
120  "add %3, %1 \n\t" // pixels now points at row 1
121  ".p2align 3 \n\t"
122  "1: \n\t"
    /* First half: mm0/mm1 = pair sums of the next source row; mm4/mm5 still hold the previous row. */
123  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
124  "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
125  "movq %%mm0, %%mm1 \n\t"
126  "movq %%mm2, %%mm3 \n\t"
127  "punpcklbw %%mm7, %%mm0 \n\t"
128  "punpcklbw %%mm7, %%mm2 \n\t"
129  "punpckhbw %%mm7, %%mm1 \n\t"
130  "punpckhbw %%mm7, %%mm3 \n\t"
131  "paddusw %%mm2, %%mm0 \n\t"
132  "paddusw %%mm3, %%mm1 \n\t"
133  "paddusw %%mm6, %%mm4 \n\t" // add rounding constant
134  "paddusw %%mm6, %%mm5 \n\t"
135  "paddusw %%mm0, %%mm4 \n\t" // previous row sums + this row sums
136  "paddusw %%mm1, %%mm5 \n\t"
137  "psrlw $2, %%mm4 \n\t" // divide the four-sample sum by 4
138  "psrlw $2, %%mm5 \n\t"
139  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" // current destination row
140  "packuswb %%mm5, %%mm4 \n\t" // interpolated row as 8 bytes
141  "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all ones
142  "paddb %%mm2, %%mm2 \n\t" // 0xFF + 0xFF per byte -> 0xFE in every byte
143  PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) // mm5 = combination of destination (mm3) and interpolated (mm4)
144  "movq %%mm5, (%2, %%"FF_REG_a") \n\t" // store one output row
145  "add %3, %%"FF_REG_a" \n\t" // advance offset by one row
146 
    /* Second half: same steps with register roles swapped, so mm0/mm1 (just computed) act as the previous row. */
147  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
148  "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
149  "movq %%mm2, %%mm3 \n\t"
150  "movq %%mm4, %%mm5 \n\t"
151  "punpcklbw %%mm7, %%mm2 \n\t"
152  "punpcklbw %%mm7, %%mm4 \n\t"
153  "punpckhbw %%mm7, %%mm3 \n\t"
154  "punpckhbw %%mm7, %%mm5 \n\t"
155  "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of this row, carried into the next iteration
156  "paddusw %%mm3, %%mm5 \n\t"
157  "paddusw %%mm6, %%mm0 \n\t"
158  "paddusw %%mm6, %%mm1 \n\t"
159  "paddusw %%mm4, %%mm0 \n\t"
160  "paddusw %%mm5, %%mm1 \n\t"
161  "psrlw $2, %%mm0 \n\t"
162  "psrlw $2, %%mm1 \n\t"
163  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" // current destination row
164  "packuswb %%mm1, %%mm0 \n\t"
165  "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE mask (mm2 was reused above)
166  "paddb %%mm2, %%mm2 \n\t"
167  PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) // mm1 = combination of destination (mm3) and interpolated (mm0)
168  "movq %%mm1, (%2, %%"FF_REG_a") \n\t" // store the second output row
169  "add %3, %%"FF_REG_a" \n\t"
170 
171  "subl $2, %0 \n\t" // two rows done per iteration
172  "jnz 1b \n\t" // exits only when h hits exactly zero
173  :"+g"(h), "+S"(pixels)
174  :"D"(block), "r"((x86_reg)line_size)
175  :FF_REG_a, "memory");
176 }
177 #endif
PAVGB_MMX
#define PAVGB_MMX(rega, regb, regr, regfe)
Definition: inline_asm.h:63
inline_asm.h
x86_reg
int x86_reg
Definition: asm.h:72
av_unused
#define av_unused
Definition: attributes.h:131
MOVQ_ZERO
#define MOVQ_ZERO(regd)
Definition: inline_asm.h:32
pixels8_xy2
av_unused STATIC void put_TMPL pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: rnd_template.c:33
DEF
#define DEF(type, name, bytes, read, write)
Definition: bytestream.h:42
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
STATIC
#define STATIC
Definition: vf_libplacebo.c:1274
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2070