FFmpeg
dialoguenhance_template.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 #include "libavutil/mem.h"
20 #include "libavutil/tx.h"
21 #include "avfilter.h"
22 #include "internal.h"
23 #include "audio.h"
24 
/*
 * Depth-dependent template glue: this header is included once per sample
 * depth by the filter source (DEPTH == 32 selects single precision,
 * anything else selects double precision).  Undefine everything first so
 * a second inclusion re-expands cleanly.
 */
#undef ctype
#undef ftype
#undef SQRT
#undef HYPOT
#undef SAMPLE_FORMAT
#undef TX_TYPE
#undef ONE
#undef ZERO
#undef HALF
#undef SIN
#undef CLIP
#undef EPSILON
#if DEPTH == 32
/* single-precision variant */
#define SAMPLE_FORMAT float
#define SQRT sqrtf
#define HYPOT hypotf
#define ctype AVComplexFloat
#define ftype float
#define TX_TYPE AV_TX_FLOAT_RDFT
#define ONE 1.f
#define ZERO 0.f
#define HALF 0.5f
#define SIN sinf
#define CLIP av_clipf
#define EPSILON FLT_EPSILON
#else
/* double-precision variant */
#define SAMPLE_FORMAT double
#define SQRT sqrt
#define HYPOT hypot
#define ctype AVComplexDouble
#define ftype double
#define TX_TYPE AV_TX_DOUBLE_RDFT
#define ONE 1.0
#define ZERO 0.0
#define HALF 0.5
#define SIN sin
#define CLIP av_clipd
#define EPSILON DBL_EPSILON
#endif

/* fn(name) expands to name_float or name_double depending on SAMPLE_FORMAT,
 * giving each depth its own set of symbols. */
#define fn3(a,b) a##_##b
#define fn2(a,b) fn3(a,b)
#define fn(a) fn2(a, SAMPLE_FORMAT)
68 
70 {
71  AudioDialogueEnhanceContext *s = ctx->priv;
72  ftype scale = ONE, iscale = ONE / (s->fft_size * 1.5f);
73  int ret;
74 
75  s->window = av_calloc(s->fft_size, sizeof(ftype));
76  if (!s->window)
77  return AVERROR(ENOMEM);
78  fn(s->window) = s->window;
79  for (int n = 0; n < s->fft_size; n++)
80  fn(s->window)[n] = SIN(M_PI*n/(s->fft_size-1));
81 
82  ret = av_tx_init(&s->tx_ctx[0], &s->tx_fn, TX_TYPE, 0, s->fft_size, &scale, 0);
83  if (ret < 0)
84  return ret;
85 
86  ret = av_tx_init(&s->tx_ctx[1], &s->tx_fn, TX_TYPE, 0, s->fft_size, &scale, 0);
87  if (ret < 0)
88  return ret;
89 
90  ret = av_tx_init(&s->itx_ctx, &s->itx_fn, TX_TYPE, 1, s->fft_size, &iscale, 0);
91  if (ret < 0)
92  return ret;
93 
94  return 0;
95 }
96 
97 static void fn(apply_window)(AudioDialogueEnhanceContext *s,
98  const ftype *in_frame, ftype *out_frame, const int add_to_out_frame)
99 {
100  const ftype *window = fn(s->window);
101  const int fft_size = s->fft_size;
102 
103  if (add_to_out_frame) {
104  for (int i = 0; i < fft_size; i++)
105  out_frame[i] += in_frame[i] * window[i];
106  } else {
107  for (int i = 0; i < fft_size; i++)
108  out_frame[i] = in_frame[i] * window[i];
109  }
110 }
111 
112 static ftype fn(sqr)(ftype x)
113 {
114  return x * x;
115 }
116 
117 static void fn(get_centere)(ctype *left, ctype *right,
118  ctype *center, int N)
119 {
120  for (int i = 0; i < N; i++) {
121  const ftype l_re = left[i].re;
122  const ftype l_im = left[i].im;
123  const ftype r_re = right[i].re;
124  const ftype r_im = right[i].im;
125  const ftype a = HALF * (ONE - SQRT((fn(sqr)(l_re - r_re) + fn(sqr)(l_im - r_im))/
126  (fn(sqr)(l_re + r_re) + fn(sqr)(l_im + r_im) + EPSILON)));
127 
128  center[i].re = a * (l_re + r_re);
129  center[i].im = a * (l_im + r_im);
130  }
131 }
132 
133 static ftype fn(flux)(ftype *curf, ftype *prevf, int N)
134 {
135  ctype *cur = (ctype *)curf;
136  ctype *prev = (ctype *)prevf;
137  ftype sum = ZERO;
138 
139  for (int i = 0; i < N; i++) {
140  ftype c_re = cur[i].re;
141  ftype c_im = cur[i].im;
142  ftype p_re = prev[i].re;
143  ftype p_im = prev[i].im;
144 
145  sum += fn(sqr)(HYPOT(c_re, c_im) - HYPOT(p_re, p_im));
146  }
147 
148  return sum;
149 }
150 
151 static ftype fn(fluxlr)(ftype *lf, ftype *lpf,
152  ftype *rf, ftype *rpf,
153  int N)
154 {
155  ctype *l = (ctype *)lf;
156  ctype *lp = (ctype *)lpf;
157  ctype *r = (ctype *)rf;
158  ctype *rp = (ctype *)rpf;
159  ftype sum = ZERO;
160 
161  for (int i = 0; i < N; i++) {
162  ftype c_re = l[i].re - r[i].re;
163  ftype c_im = l[i].im - r[i].im;
164  ftype p_re = lp[i].re - rp[i].re;
165  ftype p_im = lp[i].im - rp[i].im;
166 
167  sum += fn(sqr)(HYPOT(c_re, c_im) - HYPOT(p_re, p_im));
168  }
169 
170  return sum;
171 }
172 
174 {
175  const ftype vad = a * (fc / (fc + flr) - HALF);
176 
177  return CLIP(vad, ZERO, ONE);
178 }
179 
180 static void fn(get_final)(ftype *c, ftype *l,
181  ftype *r, ftype vad, int N,
182  ftype original, ftype enhance)
183 {
184  ctype *center = (ctype *)c;
185  ctype *left = (ctype *)l;
186  ctype *right = (ctype *)r;
187 
188  for (int i = 0; i < N; i++) {
189  ftype cP = fn(sqr)(center[i].re) + fn(sqr)(center[i].im);
190  ftype lrP = fn(sqr)(left[i].re - right[i].re) + fn(sqr)(left[i].im - right[i].im);
191  ftype G = cP / (cP + lrP + EPSILON);
192  ftype re, im;
193 
194  re = center[i].re * (original + vad * G * enhance);
195  im = center[i].im * (original + vad * G * enhance);
196 
197  center[i].re = re;
198  center[i].im = im;
199  }
200 }
201 
203 {
204  AudioDialogueEnhanceContext *s = ctx->priv;
205  ftype *center = (ftype *)s->center_frame->extended_data[0];
206  ftype *center_prev = (ftype *)s->center_frame->extended_data[1];
207  ftype *left_in = (ftype *)s->in_frame->extended_data[0];
208  ftype *right_in = (ftype *)s->in_frame->extended_data[1];
209  ftype *left_out = (ftype *)s->out_dist_frame->extended_data[0];
210  ftype *right_out = (ftype *)s->out_dist_frame->extended_data[1];
211  ftype *left_samples = (ftype *)s->in->extended_data[0];
212  ftype *right_samples = (ftype *)s->in->extended_data[1];
213  ftype *windowed_left = (ftype *)s->windowed_frame->extended_data[0];
214  ftype *windowed_right = (ftype *)s->windowed_frame->extended_data[1];
215  ftype *windowed_oleft = (ftype *)s->windowed_out->extended_data[0];
216  ftype *windowed_oright = (ftype *)s->windowed_out->extended_data[1];
217  ftype *windowed_pleft = (ftype *)s->windowed_prev->extended_data[0];
218  ftype *windowed_pright = (ftype *)s->windowed_prev->extended_data[1];
219  ftype *left_osamples = (ftype *)out->extended_data[0];
220  ftype *right_osamples = (ftype *)out->extended_data[1];
221  ftype *center_osamples = (ftype *)out->extended_data[2];
222  const int overlap = s->overlap;
223  const int offset = s->fft_size - overlap;
224  const int nb_samples = FFMIN(overlap, s->in->nb_samples);
225  ftype vad;
226 
227  // shift in/out buffers
228  memmove(left_in, &left_in[overlap], offset * sizeof(ftype));
229  memmove(right_in, &right_in[overlap], offset * sizeof(ftype));
230  memmove(left_out, &left_out[overlap], offset * sizeof(ftype));
231  memmove(right_out, &right_out[overlap], offset * sizeof(ftype));
232 
233  memcpy(&left_in[offset], left_samples, nb_samples * sizeof(ftype));
234  memcpy(&right_in[offset], right_samples, nb_samples * sizeof(ftype));
235  memset(&left_out[offset], 0, overlap * sizeof(ftype));
236  memset(&right_out[offset], 0, overlap * sizeof(ftype));
237 
238  fn(apply_window)(s, left_in, windowed_left, 0);
239  fn(apply_window)(s, right_in, windowed_right, 0);
240 
241  s->tx_fn(s->tx_ctx[0], windowed_oleft, windowed_left, sizeof(ftype));
242  s->tx_fn(s->tx_ctx[1], windowed_oright, windowed_right, sizeof(ftype));
243 
244  fn(get_centere)((ctype *)windowed_oleft,
245  (ctype *)windowed_oright,
246  (ctype *)center,
247  s->fft_size / 2 + 1);
248 
249  vad = fn(calc_vad)(fn(flux)(center, center_prev, s->fft_size / 2 + 1),
250  fn(fluxlr)(windowed_oleft, windowed_pleft,
251  windowed_oright, windowed_pright, s->fft_size / 2 + 1), s->voice);
252  vad = vad * 0.1 + 0.9 * fn(s->prev_vad);
253  fn(s->prev_vad) = vad;
254 
255  memcpy(center_prev, center, s->fft_size * sizeof(ftype));
256  memcpy(windowed_pleft, windowed_oleft, s->fft_size * sizeof(ftype));
257  memcpy(windowed_pright, windowed_oright, s->fft_size * sizeof(ftype));
258 
259  fn(get_final)(center, windowed_oleft, windowed_oright, vad, s->fft_size / 2 + 1,
260  s->original, s->enhance);
261 
262  s->itx_fn(s->itx_ctx, windowed_oleft, center, sizeof(ctype));
263 
264  fn(apply_window)(s, windowed_oleft, left_out, 1);
265 
266  memcpy(left_osamples, left_in, overlap * sizeof(ftype));
267  memcpy(right_osamples, right_in, overlap * sizeof(ftype));
268 
269  if (ctx->is_disabled)
270  memset(center_osamples, 0, overlap * sizeof(ftype));
271  else
272  memcpy(center_osamples, left_out, overlap * sizeof(ftype));
273 
274  return 0;
275 }
r
const char * r
Definition: vf_curves.c:127
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
fluxlr
static ftype fn() fluxlr(ftype *lf, ftype *lpf, ftype *rf, ftype *rpf, int N)
Definition: dialoguenhance_template.c:151
out
FILE * out
Definition: movenc.c:55
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:374
ZERO
#define ZERO
Definition: dialoguenhance_template.c:58
fn
#define fn(a)
Definition: dialoguenhance_template.c:67
fc
#define fc(width, name, range_min, range_max)
Definition: cbs_av1.c:464
CLIP
#define CLIP
Definition: dialoguenhance_template.c:61
av_tx_init
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)
Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...
Definition: tx.c:903
HYPOT
#define HYPOT
Definition: dialoguenhance_template.c:53
lpf
static float * lpf(float Fn, float Fc, float tbw, int *num_taps, float att, float *beta, int round)
Definition: asrc_sinc.c:162
window
static SDL_Window * window
Definition: ffplay.c:361
calc_vad
static ftype fn() calc_vad(ftype fc, ftype flr, ftype a)
Definition: dialoguenhance_template.c:173
sqr
static ftype fn() sqr(ftype x)
Definition: dialoguenhance_template.c:112
ftype
#define ftype
Definition: dialoguenhance_template.c:55
HALF
#define HALF
Definition: dialoguenhance_template.c:59
s
#define s(width, name)
Definition: cbs_vp9.c:198
get_final
static void fn() get_final(ftype *c, ftype *l, ftype *r, ftype vad, int N, ftype original, ftype enhance)
Definition: dialoguenhance_template.c:180
de_tx_init
static int fn() de_tx_init(AVFilterContext *ctx)
Definition: dialoguenhance_template.c:69
EPSILON
#define EPSILON
Definition: dialoguenhance_template.c:62
ctx
AVFormatContext * ctx
Definition: movenc.c:49
get_centere
static void fn() get_centere(ctype *left, ctype *right, ctype *center, int N)
Definition: dialoguenhance_template.c:117
SIN
#define SIN
Definition: dialoguenhance_template.c:60
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ctype
#define ctype
Definition: dialoguenhance_template.c:54
ONE
#define ONE
Definition: dialoguenhance_template.c:57
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
N
#define N
Definition: af_mcompand.c:54
M_PI
#define M_PI
Definition: mathematics.h:67
internal.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
TX_TYPE
#define TX_TYPE
Definition: dialoguenhance_template.c:56
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
de_stereo
static int fn() de_stereo(AVFilterContext *ctx, AVFrame *out)
Definition: dialoguenhance_template.c:202
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
apply_window
static void fn() apply_window(AudioDialogueEnhanceContext *s, const ftype *in_frame, ftype *out_frame, const int add_to_out_frame)
Definition: dialoguenhance_template.c:97
ret
ret
Definition: filter_design.txt:187
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
avfilter.h
G
#define G
Definition: huffyuv.h:43
AVFilterContext
An instance of a filter.
Definition: avfilter.h:407
mem.h
audio.h
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
SQRT
#define SQRT
Definition: dialoguenhance_template.c:52
tx.h
flux
static ftype fn() flux(ftype *curf, ftype *prevf, int N)
Definition: dialoguenhance_template.c:133