#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
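// Inner four taps of VP8's six-tap subpel filters.  Each row is repeated
// four times (REPT4) so that a single vec_msum can apply the same four taps
// to four neighbouring output pixels at once.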
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123,  12,  -1),
    REPT4(-11, 108,  36,  -8),
    REPT4( -9,  93,  50,  -6),
    REPT4(-16,  77,  77, -16),
    REPT4( -6,  50,  93,  -9),
    REPT4( -8,  36, 108, -11),
    REPT4( -1,  12, 123,  -6),
};
// Outer two taps of the 6-tap filters; the zero lanes mask off the bytes
// belonging to the other group of four output pixels.
static const vec_s8 h_subpel_filters_outer[4] =
#define LOAD_H_SUBPEL_FILTER(i) \
    vec_s8 filter_inner  = h_subpel_filters_inner[i]; \
    vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
    vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
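// filter_outerl is filter_outerh rotated by two bytes so that the same pair
// of outer taps lines up with the pixel groups used for output pixels 4-7.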
// Big-endian builds combine vec_ld pairs with precomputed permutes; on
// little-endian VSX a single unaligned load plus perm_inner does the job.
#if HAVE_BIGENDIAN
#define GET_PIXHL(offset)                   \
    a = vec_ld((offset)-is6tap-1, src);     \
    b = vec_ld((offset)-is6tap-1+15, src);  \
    pixh  = vec_perm(a, b, permh##offset);  \
    pixl  = vec_perm(a, b, perml##offset)

#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset)                   \
    a = vec_vsx_ld((offset)-is6tap-1, src); \
    pixh  = vec_perm(a, a, perm_inner);     \
    pixl  = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))

#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif
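// FILTER_H computes eight horizontally filtered pixels: vec_msum sums four
// taps per output into 32-bit accumulators pre-loaded with the rounding bias
// c64, the outer taps are folded in for the 6-tap case, and the result is
// packed to signed 16 bits and shifted right by 7 (the taps sum to 128).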
#define FILTER_H(dstv, off) \
    GET_PIXHL(off);                            \
    filth = vec_msum(filter_inner, pixh, c64); \
    filtl = vec_msum(filter_inner, pixl, c64); \
    if (is6tap) { \
        GET_OUTER(off);                                \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    } \
    dstv = vec_packs(filth, filtl); \
    dstv = vec_sra(dstv, c7)
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
 
    vec_u8 align_vec0, align_vec8, permh0, permh8;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
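    // perm_inner6/perm_inner4 gather, for each of outputs 0-3, the four
    // consecutive source bytes that vec_msum multiplies by the inner taps;
    // c64 is the 1 << 6 rounding bias paired with the final shift right by 7.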
 
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
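    // The permutes above fold the vec_lvsl misalignment correction into the
    // pixel-gather patterns, so the per-row loads need no separate
    // realignment; this setup pairs with the big-endian vec_ld variant of
    // GET_PIXHL/GET_OUTER.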
 
            filt = vec_packsu(f16h, f16l); // w == 16: pack both 8-pixel halves

            filt = vec_packsu(f16h, f16h); // w <= 8: f16h already holds every output pixel
 
// Vertical subpel filter taps, stored as magnitudes; FILTER_V subtracts the
// rows multiplied by taps 1 and 4, which are negative in the VP8 filters.
static const vec_u8 v_subpel_filters[7] =
{
    { 0,   6, 123,  12,   1,   0 },
    { 2,  11, 108,  36,   8,   1 },
    { 0,   9,  93,  50,   6,   0 },
    { 3,  16,  77,  77,  16,   3 },
    { 0,   6,  50,  93,   9,   0 },
    { 1,   8,  36, 108,  11,   2 },
    { 0,   1,  12, 123,   6,   0 },
};
#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)
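// FILTER_V multiplies each source row by its splatted tap with vec_mule or
// vec_mulo (even or odd byte lanes), subtracts rows 1 and 4 (their taps are
// negative but stored as magnitudes), optionally adds the outer rows for the
// 6-tap case, then rounds with c64 and shifts right by 7.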
#define FILTER_V(dstv, vec_mul) \
    s1f = (vec_s16)vec_mul(s1, f1); \
    s2f = (vec_s16)vec_mul(s2, f2); \
    s3f = (vec_s16)vec_mul(s3, f3); \
    s4f = (vec_s16)vec_mul(s4, f4); \
    s2f = vec_subs(s2f, s1f); \
    s3f = vec_subs(s3f, s4f); \
    if (is6tap) { \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f); \
        s3f = vec_adds(s3f, s5f); \
    } \
    dstv = vec_adds(s2f, s3f); \
    dstv = vec_adds(dstv, c64); \
    dstv = vec_sra(dstv, c7)
#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif
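// LOAD_HL fetches 16 consecutive source pixels of one row with pixels 0-7
// placed in the even byte lanes and pixels 8-15 in the odd lanes, so
// vec_mule/vec_mulo in FILTER_V can filter the two halves independently.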
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
 
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
 
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);

        perm_vec = vec_mergeh(align_vech, align_vecl);

        perm_vec = vec_mergeh(align_vech, align_vech);
 
    if (is6tap)
        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = LOAD_HL( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;
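    // The filter loop keeps a sliding window of source rows: each iteration
    // loads one new row into s5 (6-tap) or s4 (4-tap), filters and stores,
    // then shifts the window down by one row.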
 
            s5 = LOAD_HL(0, src, perm_vec); // 6-tap: the new row becomes s5

            s4 = LOAD_HL(0, src, perm_vec); // 4-tap: the new row becomes s4

        FILTER_V(f16h, vec_mule);

            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l); // w == 16: pack both 8-pixel halves

            filt = vec_packsu(f16h, f16h); // w <= 8: f16h already holds every output pixel
 
#define EPEL_FUNCS(WIDTH, TAPS) \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,      16,      h,   mx, my); \
    } \
}
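// A call such as EPEL_FUNCS(16, 6) expands to the thin wrappers
// put_vp8_epel16_h6_altivec() and put_vp8_epel16_v6_altivec(), which forward
// to the shared cores with WIDTH and the 4/6-tap flag baked in; EPEL_HV then
// stacks those wrappers, filtering horizontally into the 16-byte-strided
// temporary buffer before filtering it vertically into dst.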
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, const uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
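    // Full-pel 16-pixel-wide copy: the loop below is unrolled to store four
    // 16-byte rows per iteration.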
 
    register vector unsigned char perm;
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
 
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}
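    // put_vp8_epel_pixels_tab is indexed [width: 0=16, 1=8, 2=4]
    // [vertical filter: 0=none, 1=4-tap, 2=6-tap][horizontal filter: same
    // scheme], so e.g. [1][2][1] is the 8-pixel-wide h4/v6 combination.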
 
    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;

    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;

    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;

    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;