FFmpeg
vp8dsp_altivec.c
/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/vp8dsp.h"

#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }

// h subpel filter uses msum to multiply+add 4 pixel taps at once
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123,  12,  -1),
    REPT4(-11, 108,  36,  -8),
    REPT4( -9,  93,  50,  -6),
    REPT4(-16,  77,  77, -16),
    REPT4( -6,  50,  93,  -9),
    REPT4( -8,  36, 108, -11),
    REPT4( -1,  12, 123,  -6),
};

// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
static const vec_s8 h_subpel_filters_outer[4] =
{
    REPT4(0, 0, 2, 1),
    REPT4(0, 0, 3, 3),
    REPT4(0, 0, 1, 2),
    REPT4(0, 0, 0, 0),
};

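// Two adjacent fractional positions share one outer entry ((i)>>1 indexing);
// the shared value is only read when is6tap is set, so the sharing is safe.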
#define LOAD_H_SUBPEL_FILTER(i) \
    vec_s8 filter_inner  = h_subpel_filters_inner[i]; \
    vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
    vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)

#if HAVE_BIGENDIAN
#define GET_PIXHL(offset)                   \
    a = vec_ld((offset)-is6tap-1, src);     \
    b = vec_ld((offset)-is6tap-1+15, src);  \
    pixh = vec_perm(a, b, permh##offset);   \
    pixl = vec_perm(a, b, perml##offset)

#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset)                   \
    a = vec_vsx_ld((offset)-is6tap-1, src); \
    pixh = vec_perm(a, a, perm_inner);      \
    pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))

#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif

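// Each VP8 subpel filter's taps sum to 128, so the msum accumulator is
// seeded with 64 (c64) and the packed result is shifted right by 7 (c7),
// i.e. a rounded division by 128.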
#define FILTER_H(dstv, off) \
    GET_PIXHL(off);                            \
    filth = vec_msum(filter_inner, pixh, c64); \
    filtl = vec_msum(filter_inner, pixl, c64); \
\
    if (is6tap) { \
        GET_OUTER(off);                                \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    } \
    if (w == 4) \
        filtl = filth; /* discard pixels 4-7 */ \
    dstv = vec_packs(filth, filtl); \
    dstv = vec_sra(dstv, c7)

static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
#if HAVE_BIGENDIAN
    vec_u8 align_vec0, align_vec8, permh0, permh8;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 b;
#endif
    vec_u8 filt, a, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

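    // The inner taps cover src[-1..+2] relative to each output pixel; the
    // 6-tap load starts one byte earlier to pick up the outer taps as well,
    // which is why perm_inner6 selects bytes 1..7 while perm_inner4 uses 0..6.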
    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
#endif

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}


// v subpel filter does a simple vertical multiply + add
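// The coefficients are stored as magnitudes; FILTER_V applies the signs,
// subtracting taps 1 and 4 and adding the others.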
static const vec_u8 v_subpel_filters[7] =
{
    { 0,  6, 123,  12,  1,  0 },
    { 2, 11, 108,  36,  8,  1 },
    { 0,  9,  93,  50,  6,  0 },
    { 3, 16,  77,  77, 16,  3 },
    { 0,  6,  50,  93,  9,  0 },
    { 1,  8,  36, 108, 11,  2 },
    { 0,  1,  12, 123,  6,  0 },
};

#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)

#define FILTER_V(dstv, vec_mul) \
    s1f = (vec_s16)vec_mul(s1, f1); \
    s2f = (vec_s16)vec_mul(s2, f2); \
    s3f = (vec_s16)vec_mul(s3, f3); \
    s4f = (vec_s16)vec_mul(s4, f4); \
    s2f = vec_subs(s2f, s1f); \
    s3f = vec_subs(s3f, s4f); \
    if (is6tap) { \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f); \
        s3f = vec_adds(s3f, s5f); \
    } \
    dstv = vec_adds(s2f, s3f); \
    dstv = vec_adds(dstv, c64); \
    dstv = vec_sra(dstv, c7)

#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif

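// LOAD_HL interleaves pixels 0-7 with pixels 8-15 (even/odd byte lanes), so
// FILTER_V can compute the left half of a 16-wide row with vec_mule and the
// right half with vec_mulo.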
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);
#endif

    if (is6tap)
        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = LOAD_HL( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h --> 0) {
        if (is6tap)
            s5 = LOAD_HL(0, src, perm_vec);
        else
            s4 = LOAD_HL(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}

#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

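// Combined H+V filtering runs in two passes through a 16-byte-stride
// temporary buffer: the horizontal pass starts 2 (6-tap) or 1 (4-tap) rows
// above the block so the vertical pass has the extra rows it needs.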
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride,   sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,        16,      h,   mx, my); \
    } \
}

286 
287 EPEL_FUNCS(16,6)
288 EPEL_FUNCS(8, 6)
289 EPEL_FUNCS(8, 4)
290 EPEL_FUNCS(4, 6)
291 EPEL_FUNCS(4, 4)
292 
293 EPEL_HV(16, 6,6)
294 EPEL_HV(8, 6,6)
295 EPEL_HV(8, 4,6)
296 EPEL_HV(8, 6,4)
297 EPEL_HV(8, 4,4)
298 EPEL_HV(4, 6,6)
299 EPEL_HV(4, 4,6)
300 EPEL_HV(4, 6,4)
301 EPEL_HV(4, 4,4)
302 
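// Fullpel copy of a 16-wide block; mx and my are unused here.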
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char perm;
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0,        src, perm), 0,        dst);
        vec_st(load_with_perm_vec(sstride,  src, perm), dstride,  dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}

#endif /* HAVE_ALTIVEC */

av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

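    /* put_vp8_epel_pixels_tab is indexed as [4 - log2(width)][v filter][h filter],
     * where filter index 0 means no filtering, 1 the 4-tap and 2 the 6-tap filter. */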
    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;

    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;

    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;

    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
#endif /* HAVE_ALTIVEC */
}