FFmpeg
mpeg4videodsp.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "mpeg4videodsp.h"
#include "videodsp.h"
27 
28 #if HAVE_SSSE3_INLINE
29 
/* Broadcast the low 16-bit word of an XMM register into all 8 word lanes:
 * pshuflw replicates word 0 across the low quadword, punpcklqdq then
 * duplicates that quadword into the high half. */
#define SPLATW(reg) "pshuflw $0, %%" #reg ", %%" #reg "\n\t" \
                    "punpcklqdq %%" #reg ", %%" #reg "\n\t"

/* 16-byte-aligned array of 8 uint16_t wrapped in a struct so it can be
 * declared and passed to the asm as a single memory operand (used below to
 * spill constants on x86-32, where only 8 XMM registers exist). */
typedef struct {
    DECLARE_ALIGNED_16(uint16_t, u16)[8];
} xmm_u16;

/* Constant { 0, 1, ..., 7 }: multiplied (pmullw) by the per-column subpel
 * increment in gmc_ssse3() to seed one subpel coordinate per output pixel. */
DECLARE_ASM_CONST(16, xmm_u16, pw_0to7) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
38 
/**
 * SSSE3 inline-assembly global motion compensation (GMC) for an
 * 8-pixel-wide block of height h, with bilinear subpel interpolation.
 *
 * Falls back to the C implementation ff_gmc_c() whenever the fast path
 * cannot handle the transform: the full-pel offset is not constant over
 * the block, the motion parameters use more than 16 bits of subpel
 * precision, or the block reads outside the picture and no SSE2
 * edge-emulation helper was built.
 *
 * @param dst    destination block
 * @param src    source picture plane
 * @param stride line size, in bytes, of both dst and src
 * @param h      number of rows to produce (clamped by callers; the
 *               edge-emulation buffer assumes h <= MAX_H)
 * @param ox,oy  start offset in fixed point, scaled by 1 << (16 + shift)
 * @param dxx,dxy,dyx,dyy  affine increments, same fixed-point scale
 * @param shift  extra fractional precision of the transform
 * @param r      rounding term added before the final right shift
 * @param width,height  dimensions of the valid source area
 */
static void gmc_ssse3(uint8_t *dst, const uint8_t *src,
                      int stride, int h, int ox, int oy,
                      int dxx, int dxy, int dyx, int dyy,
                      int shift, int r, int width, int height)
{
    enum {
        W               = 8,
        EDGE_EMU_STRIDE = 16, //< anything >= W+1 will do
        MAX_H           = 16,
    };
    const int w = 8;
    /* full-pel part of the start offset */
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    /* subpel remainder; note '-' binds tighter than '&', so this is
     * ox & ((1 << (16 + shift)) - 1) */
    const int ox2 = ox & (1 << (16 + shift)) - 1;
    const int oy2 = oy & (1 << (16 + shift)) - 1;
    /* ">> 4" reduces everything to 12 fractional bits so the per-pixel
     * coordinates fit in 16-bit lanes (the asm extracts the top 4 bits
     * with "psrlw $12" as the bilinear weight) */
    const int oxs = ox2 >> 4;
    const int oys = oy2 >> 4;
    /* dxx2/dyy2: remove the implicit +1 pixel step per column/row so the
     * lane-local accumulators only track the subpel drift */
    const int dxx2 = dxx - (1 << (16 + shift));
    const int dyy2 = dyy - (1 << (16 + shift));
    const int dxxs = dxx2 >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy2 >> 4;

    /* total subpel drift across the block in each direction, used below to
     * test whether the full-pel offset stays constant over all w x h pixels */
    const int dxw = dxx2 * (w - 1);
    const int dyh = dyy2 * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    /* the unsigned casts also catch negative ix/iy */
    int need_emu = (unsigned) ix >= width - w || width < w ||
                   (unsigned) iy >= height - h || height < h
                   ;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox2 + dxw) | (ox2 + dxh) | (ox2 + dxw + dxh) |
         (oy2 + dyw) | (oy2 + dyh) | (oy2 + dyw + dyh)) >> (16 + shift) ||
        // uses more than 16 bits of subpel mv (only at huge resolution)
        (dxx | dxy | dyx | dyy) & 15 ||
        (!HAVE_SSE2_EXTERNAL && need_emu)) {
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    const ptrdiff_t dst_stride = stride;
    ptrdiff_t src_stride = stride;
#if HAVE_SSE2_EXTERNAL
    /* copy the (w+1) x (h+1) source area into a local buffer with the
     * out-of-picture pixels replicated, then read from that instead */
    uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE];
    if (need_emu) {
        ff_emulated_edge_mc_sse2(edge_buf, src, EDGE_EMU_STRIDE, src_stride,
                                 w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
        src_stride = EDGE_EMU_STRIDE;
    }
#endif

#if ARCH_X86_32
    /* x86-32 has only xmm0-xmm7: spill the loop-invariant vectors and the
     * shift count to aligned memory instead of keeping them in registers */
    xmm_u16 dxy8, dyy8, r8;
    DECLARE_ALIGNED_16(uint64_t, shift2) = 2 * shift;
#endif

    /* Register roles inside the loop:
     *   xmm1 = per-column x subpel accumulator, xmm7 = y accumulator
     *   xmm5 = interleaved src row pair [p(x), p(x+1)] carried to next row
     *   xmm6 = splatted scale s = 1 << shift
     * On x86-64 the invariants live in xmm8/9/11/12; on x86-32 they are
     * reloaded from the spill slots above. */
    __asm__ volatile (
        "movd         %[dxxs], %%xmm2       \n\t"
        "movd         %[dyxs], %%xmm3       \n\t"
        "movd          %[oxs], %%xmm1       \n\t"
        SPLATW(xmm2)
        "movd          %[oys], %%xmm7       \n\t"
        SPLATW(xmm3)
        "pmullw "MANGLE(pw_0to7)", %%xmm2   \n\t"
        SPLATW(xmm1)
        "movd            %[s], %%xmm6       \n\t"
        "pmullw "MANGLE(pw_0to7)", %%xmm3   \n\t"
        "movq        (%[src]), %%xmm5       \n\t"
        SPLATW(xmm7)
#if ARCH_X86_32
        "movd         %[dxys], %%xmm0       \n\t"
#else
        "movd         %[dxys], %%xmm11      \n\t"
#endif
        "paddw         %%xmm2, %%xmm1       \n\t"
        "movq       1(%[src]), %%xmm2       \n\t"
        SPLATW(xmm6)
#if ARCH_X86_32
        "movd         %[dyys], %%xmm4       \n\t"
#else
        "movd         %[dyys], %%xmm9       \n\t"
#endif
        "paddw         %%xmm3, %%xmm7       \n\t"
        "punpcklbw     %%xmm2, %%xmm5       \n\t"
#if ARCH_X86_32
        SPLATW(xmm0)
        "movd            %[r], %%xmm2       \n\t"
        SPLATW(xmm4)
        "movdqa        %%xmm0, %[dxy8]      \n\t"
        SPLATW(xmm2)
        "movdqa        %%xmm4, %[dyy8]      \n\t"
        "movdqa        %%xmm2, %[r8]        \n\t"
#else
        SPLATW(xmm11)
        "movd            %[r], %%xmm8       \n\t"
        SPLATW(xmm9)
        SPLATW(xmm8)
        "movd       %[shift2], %%xmm12      \n\t"
#endif

        "1:                                 \n\t"
        "add    %[src_stride], %[src]       \n\t"
        "movq        (%[src]), %%xmm3       \n\t"
        "movq       1(%[src]), %%xmm0       \n\t"
        "movdqa        %%xmm1, %%xmm4       \n\t"
        "psrlw            $12, %%xmm4       \n\t" // dx
        "movdqa        %%xmm6, %%xmm2       \n\t"
        "psubw         %%xmm4, %%xmm2       \n\t" // (s-dx)
        "psllw             $8, %%xmm4       \n\t"
        "por           %%xmm4, %%xmm2       \n\t" // s-dx,dx,s-dx,dx (bytes)
        "pmaddubsw     %%xmm2, %%xmm5       \n\t" // src[0, 0] * (s - dx) + src[1,0] * dx
        "punpcklbw     %%xmm0, %%xmm3       \n\t"
        "movdqa        %%xmm3, %%xmm0       \n\t"
        "pmaddubsw     %%xmm2, %%xmm3       \n\t" // src[0, 1] * (s - dx) + src[1,1] * dx
#if ARCH_X86_32
        "paddw         %[dxy8], %%xmm1      \n\t"
#else
        "paddw         %%xmm11, %%xmm1      \n\t"
#endif
        "movdqa        %%xmm7, %%xmm4       \n\t"
        "movdqa        %%xmm6, %%xmm2       \n\t"
        "psrlw            $12, %%xmm4       \n\t" // dy
        "psubw         %%xmm4, %%xmm2       \n\t" // (s-dy)
        "pmullw        %%xmm5, %%xmm2       \n\t" // (src[0, 0] * (s - dx) + src[1,0] * dx) * (s - dy)
#if ARCH_X86_32
        "paddw         %[dyy8], %%xmm7      \n\t"
#else
        "paddw         %%xmm9, %%xmm7       \n\t"
#endif
        "pmullw        %%xmm3, %%xmm4       \n\t" // (src[0, 1] * (s - dx) + src[1,1] * dx) * dy

#if ARCH_X86_32
        "paddw           %[r8], %%xmm2      \n\t"
#else
        "paddw         %%xmm8, %%xmm2       \n\t"
#endif
        "paddw         %%xmm2, %%xmm4       \n\t"

#if ARCH_X86_32
        "psrlw      %[shift2], %%xmm4       \n\t"
#else
        "psrlw        %%xmm12, %%xmm4       \n\t"
#endif
        "packuswb      %%xmm4, %%xmm4       \n\t"
        "movq          %%xmm4, (%[dst])     \n\t"
        "movdqa        %%xmm0, %%xmm5       \n\t" // carry bottom row to next iteration's top row
        "add    %[dst_stride], %[dst]       \n\t"

        "decl             %[h]              \n\t"
        "jnz               1b               \n\t"
        : [dst]"+r"(dst), [src]"+r"(src),
#if HAVE_6REGS || HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
          [h]"+r"(h)
#else
          [h]"+m"(h)
#endif
#if ARCH_X86_32
          , [dxy8]"=m" (dxy8), [dyy8]"=m" (dyy8), [r8]"=m" (r8)
#endif
        : [dst_stride]"r"(dst_stride), [src_stride]"r"(src_stride),
          [s]"g" (1 << shift),
#if ARCH_X86_32
          [shift2]"m" (shift2),
#else
          [shift2]"g" (2*shift),
#endif
          [oxs]"g"(oxs), [oys]"g"(oys), [dxxs]"g"(dxxs), [dyxs]"g"(dyxs),
          [dxys]"g"(dxys), [dyys]"g"(dyys), [r]"g"(r) NAMED_CONSTRAINTS_ADD(pw_0to7)
        : XMM_CLOBBERS("xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",)
#if ARCH_X86_64
          XMM_CLOBBERS("xmm8", "xmm9", "xmm10", "xmm11", "xmm12",)
#endif
          "memory");
}
218 
219 #endif /* HAVE_SSSE3_INLINE */
220 
222 {
223 #if HAVE_SSSE3_INLINE
224  int cpu_flags = av_get_cpu_flags();
225 
226  if (INLINE_SSSE3(cpu_flags))
227  c->gmc = gmc_ssse3;
228 #endif /* HAVE_SSSE3_INLINE */
229 }
W
@ W
Definition: mpeg4videodsp.c:32
cpu.h
r
const char * r
Definition: vf_curves.c:127
ff_mpeg4videodsp_init_x86
av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c)
Definition: mpeg4videodsp.c:221
mem_internal.h
videodsp.h
Mpeg4VideoDSPContext
Definition: mpeg4videodsp.h:28
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
ff_emulated_edge_mc_sse2
void ff_emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, int block_w, int block_h, int src_x, int src_y, int w, int h)
Definition: videodsp_init.c:191
MANGLE
#define MANGLE(a)
Definition: asm.h:126
av_cold
#define av_cold
Definition: attributes.h:111
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:144
s
#define s(width, name)
Definition: cbs_vp9.c:198
asm.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
height
#define height
Definition: dsp.h:89
shift
static int shift(int a, int b)
Definition: bonk.c:261
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
cpu.h
attributes.h
DECLARE_ASM_CONST
DECLARE_ASM_CONST(16, double, pd_1)[2]
shift2
static const uint8_t shift2[6]
Definition: dxa.c:49
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:97
INLINE_SSSE3
#define INLINE_SSSE3(flags)
Definition: cpu.h:89
mpeg4videodsp.h
w
uint8_t w
Definition: llvidencdsp.c:39
ff_gmc_c
void ff_gmc_c(uint8_t *dst, const uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
Definition: mpeg4videodsp.c:47
DECLARE_ALIGNED_16
#define DECLARE_ALIGNED_16(t, v)
Definition: mem_internal.h:112
h
h
Definition: vp9dsp_template.c:2070
stride
#define stride
Definition: h264pred_template.c:536
width
#define width
Definition: dsp.h:89
src
#define src
Definition: vp8dsp.c:248