#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "util_altivec.h"

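/* dst[i] *= src[i] for len floats, two vectors (eight floats) per
 * iteration. Like the rest of this file it assumes 16-byte-aligned
 * buffers and a length that is a multiple of 8, which is what the
 * DSPContext callers are expected to provide. */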
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
    int i;
    vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld( 0, dst+i);
        s  = vec_ld( 0, src+i);
        d1 = vec_ld(16, dst+i);
        d0 = vec_madd(d0, s, zero);
        d1 = vec_madd(d1, vec_ld(16, src+i), zero);
        vec_st(d0,  0, dst+i);
        vec_st(d1, 16, dst+i);
    }
}

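/* dst[i] = src0[i] * src1[len-1-i], i.e. src1 is read back to front.
 * Each vector loaded from src1 holds four floats in forward order, so the
 * mergeh/mergel cascade below reverses the elements within a vector:
 * [a b c d] -> [c c d d]/[a a b b] -> [d c b a]. */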
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0,
                 s2, s3, zero = (vector float)vec_splat_u32(0);
    src1 += len-4;
    for(i=0; i<len-7; i+=8) {
        /* reverse the first four src1 elements while the src0 loads issue */
        s1 = vec_ld(0, src1-i);
        s0 = vec_ld(0, src0+i);
        l0 = vec_mergel(s1, s1);
        s3 = vec_ld(-16, src1-i);
        h0 = vec_mergeh(s1, s1);
        s2 = vec_ld(16, src0+i);
        s1 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));

        /* same reversal for the next four src1 elements */
        l0 = vec_mergel(s3, s3);
        d = vec_madd(s0, s1, zero);
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst+i);
        s3 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));
        d = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst+i);
    }
}

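/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3, matching
 * ff_vector_fmul_add_add_c. Only the step==1, src3==0 case is vectorized
 * here; dst may be unaligned, which is handled with the usual lvsl/lvsr
 * read-merge-write sequence. Everything else falls back to the C
 * implementation; the step==2 path below stays disabled (#if 0) because
 * it still produces wrong results. */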
static void vector_fmul_add_add_altivec(float *dst, const float *src0,
                                        const float *src1, const float *src2,
                                        int src3, int len, int step)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0, dst),
                         mask = vec_lvsl(0, dst);

#if 0 //FIXME: there is still something wrong
    if (step == 2) {
        int y;
        vector float d0, d1, s3, t2;
        vector unsigned int sel =
            vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0));
        t1 = vec_ld(16, dst);
        for (i=0,y=0; i<len-3; i+=4,y+=8) {
            s0 = vec_ld(0, src0+i);
            s1 = vec_ld(0, src1+i);
            s2 = vec_ld(0, src2+i);
            t2 = vec_ld(31, dst+y);

            d = vec_madd(s0, s1, s2);

            /* note: t0 appears to be read here before it is ever written
               on the first iteration, which is likely part of the FIXME */
            d0 = vec_perm(t0, t1, mask);
            d0 = vec_sel(vec_mergeh(d, d), d0, sel);
            edges = vec_perm(t1, t0, mask);
            t0 = vec_perm(edges, d0, align);
            t1 = vec_perm(d0, edges, align);
            vec_stl(t0, 0, dst+y);

            d1 = vec_perm(t1, t2, mask);
            d1 = vec_sel(vec_mergel(d, d), d1, sel);
            edges = vec_perm(t2, t1, mask);
            t1 = vec_perm(edges, d1, align);
            t2 = vec_perm(d1, edges, align);
            vec_stl(t1, 16, dst+y);
            t0 = t1;
            vec_stl(t2, 31, dst+y);
            t1 = t2;
        }
    } else
#endif
    if (step == 1 && src3 == 0)
        for (i=0; i<len-3; i+=4) {
            t0 = vec_ld(0, dst+i);
            t1 = vec_ld(15, dst+i);
            s0 = vec_ld(0, src0+i);
            s1 = vec_ld(0, src1+i);
            s2 = vec_ld(0, src2+i);
            edges = vec_perm(t1, t0, mask);
            d = vec_madd(s0, s1, s2);
            t1 = vec_perm(d, edges, align);
            t0 = vec_perm(edges, d, align);
            vec_st(t1, 15, dst+i);
            vec_st(t0, 0, dst+i);
        }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

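/* MDCT overlap-add windowing. In scalar terms this is roughly (a sketch of
 * what the loop computes, cf. the C reference ff_vector_fmul_window_c),
 * with dst/win/src0 pre-advanced by len:
 *     for (i = -len, j = len-1; i < 0; i++, j--) {
 *         dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias;
 *         dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias;
 *     }
 * The vector version walks i forward and j backward 16 bytes at a time and
 * reverses the j-side vectors with a single vec_perm. */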
static void vector_fmul_window_altivec(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       float add_bias, int len)
{
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i, j;

    dst += len;
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    /* i and j are byte offsets: i walks the first half forward while j
       walks the second half backward */
    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}

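/* dst[i] = (float)src[i] * mul for len int32 samples, eight per iteration.
 * vec_ctf(..., 0) converts int->float with no fractional scaling; the
 * scalar multiplier is splatted into a vector once outside the loop. */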
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src,
                                               float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld( 0, src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}

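/* Convert eight floats starting at src to eight int16 values.
 * vec_cts(..., 0) converts float->int32 with saturation and no scaling,
 * and vec_packs narrows the two int32 vectors to int16, again with
 * saturation, which gives the clipping float_to_int16 requires. */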
static vector signed short
float_to_int16_one_altivec(const float *src)
{
    vector float s0 = vec_ld(0, src);
    vector float s1 = vec_ld(16, src);
    vector signed int t0 = vec_cts(s0, 0);
    vector signed int t1 = vec_cts(s1, 0);
    return vec_packs(t0, t1);
}

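/* Convert len floats to int16. The aligned path is a straight
 * convert-and-store loop; when dst is not 16-byte aligned, each result
 * vector is merged with the surrounding destination bytes via lvsl/lvsr
 * permutes and written back with two overlapping stores. */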
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15)
        for(i=0; i<len-7; i+=8) {
            d0 = vec_ld(0, dst+i);
            d = float_to_int16_one_altivec(src+i);
            d1 = vec_ld(15, dst+i);
            /* preserve the bytes outside our 16-element window, then
               rotate the converted data in with two overlapping stores */
            d1 = vec_perm(d1, d0, vec_lvsl(0, dst+i));
            align = vec_lvsr(0, dst+i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0,  0, dst+i);
            vec_st(d1, 15, dst+i);
        }
    else
        for(i=0; i<len-7; i+=8) {
            d = float_to_int16_one_altivec(src+i);
            vec_st(d, 0, dst+i);
        }
}

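/* Convert and interleave per-channel float buffers into packed int16.
 * Mono defers to float_to_int16_altivec; stereo interleaves with
 * vec_mergeh/vec_mergel (plus the usual permute dance when dst is
 * unaligned); any other channel count converts one channel at a time into
 * an aligned temporary and scatters it with a scalar loop. */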
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else if (channels == 2) {
        if(((long)dst)&15)
            for(i=0; i<len-7; i+=8) {
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                c0 = vec_mergeh(t0, t1);
                c1 = vec_mergel(t0, t1);
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0, 0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                /* dst and i both advance by 8, so the write position moves
                   32 bytes (16 samples) per 8 stereo frames */
                dst += 8;
            }
        else
            for(i=0; i<len-7; i+=8) {
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0,  0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
            }
    } else {
        DECLARE_ALIGNED(16, int16_t, tmp[len]);
        int c, j;
        for (c = 0; c < channels; c++) {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j+=channels) {
                dst[j] = tmp[i];
            }
        }
    }
}

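/* Install the AltiVec float routines into the DSPContext. The windowing
 * and float->int16 functions are skipped in bit-exact mode, presumably
 * because their results are not bit-identical to the C reference
 * implementations. */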
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->vector_fmul = vector_fmul_altivec;
    c->vector_fmul_reverse = vector_fmul_reverse_altivec;
    c->vector_fmul_add_add = vector_fmul_add_add_altivec;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->vector_fmul_window = vector_fmul_window_altivec;
        c->float_to_int16 = float_to_int16_altivec;
        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
    }
}