FFmpeg
af_whisper.c
/*
 * Copyright (c) 2025 Vittorio Palmisano
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#include <whisper.h>

#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/channel_layout.h"
#include "libavutil/samplefmt.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"
#include "libavutil/mem.h"
#include "libavutil/avstring.h"
#include "libavutil/internal.h"
#include "libavformat/avio.h"
#include "libavutil/thread.h"

#include "formats.h"

typedef struct WhisperContext {
    const AVClass *class;
    char *model_path;
    char *language;
    bool use_gpu;
    int gpu_device;
    char *vad_model_path;
    float vad_threshold;
    int64_t vad_min_speech_duration;
    int64_t vad_min_silence_duration;

    int64_t queue;
    char *destination;
    char *format;

    struct whisper_context *ctx_wsp;
    struct whisper_vad_context *ctx_vad;
    struct whisper_vad_params vad_params;

    float *audio_buffer;
    int audio_buffer_queue_size;
    int audio_buffer_fill_size;
    int audio_buffer_vad_size;
    int64_t audio_buffer_start_ms;

    int eof;
    int64_t next_pts;

    AVIOContext *avio_context;
    int index;
} WhisperContext;

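/*
 * Buffer bookkeeping, as used by the functions below: audio_buffer holds up
 * to audio_buffer_queue_size float samples at WHISPER_SAMPLE_RATE;
 * audio_buffer_fill_size is the number of valid samples currently queued;
 * audio_buffer_vad_size tracks the fill level at the last VAD pass so the
 * detector is not re-run on every incoming frame; audio_buffer_start_ms is
 * the input timestamp (in milliseconds) of the first queued sample and is
 * used to offset the per-segment timestamps reported by whisper.cpp.
 */
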
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
{
    AVFilterContext *ctx = user_data;
    int av_log_level = AV_LOG_DEBUG;

    switch (level) {
    case GGML_LOG_LEVEL_ERROR:
        av_log_level = AV_LOG_ERROR;
        break;
    case GGML_LOG_LEVEL_WARN:
        av_log_level = AV_LOG_WARNING;
        break;
    }
    av_log(ctx, av_log_level, "%s", text);
}

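/*
 * Unit conversions used by init() below: the AV_OPT_TYPE_DURATION options
 * (queue, vad_min_speech_duration, vad_min_silence_duration) are stored in
 * microseconds (AV_TIME_BASE units). av_rescale(x, WHISPER_SAMPLE_RATE,
 * AV_TIME_BASE) converts such a duration into a sample count and
 * av_rescale(x, 1000, AV_TIME_BASE) into milliseconds. For example, with the
 * default queue of 3000000 us (3 s), the queue buffer holds
 * 3000000 * 16000 / 1000000 = 48000 samples, assuming whisper.cpp's
 * WHISPER_SAMPLE_RATE of 16 kHz.
 */
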
static int init(AVFilterContext *ctx)
{
    WhisperContext *wctx = ctx->priv;

    static AVOnce init_static_once = AV_ONCE_INIT;
    ff_thread_once(&init_static_once, ggml_backend_load_all);

    whisper_log_set(cb_log, ctx);

    // Init whisper context
    if (!wctx->model_path) {
        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
        return AVERROR(EINVAL);
    }

    struct whisper_context_params params = whisper_context_default_params();
    params.use_gpu = wctx->use_gpu;
    params.gpu_device = wctx->gpu_device;

    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
    if (wctx->ctx_wsp == NULL) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
        return AVERROR(EIO);
    }

    // Init buffer
    wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
    wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
    if (!wctx->audio_buffer)
        return AVERROR(ENOMEM);

    // Init VAD model context
    if (wctx->vad_model_path) {
        struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
        ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
        // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
        ctx_params.gpu_device = wctx->gpu_device;
        wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);

        wctx->vad_params = whisper_vad_default_params();
        wctx->vad_params.threshold = wctx->vad_threshold;
        wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
        wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
        wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
        wctx->vad_params.speech_pad_ms = 0;
        wctx->vad_params.samples_overlap = 0;
    }

    wctx->next_pts = AV_NOPTS_VALUE;

    if (wctx->destination && strcmp("", wctx->destination)) {
        const char *dst = wctx->destination;
        if (!strcmp("-", dst))
            dst = "pipe:1";
        int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);

        if (ret < 0) {
            av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
            return ret;
        }

        wctx->avio_context->direct = AVIO_FLAG_DIRECT;
    }

    av_log(ctx, AV_LOG_INFO,
           "Whisper filter initialized: model: %s lang: %s queue: %"PRId64" ms\n",
           wctx->model_path, wctx->language, wctx->queue / 1000);

    return 0;
}

static void uninit(AVFilterContext *ctx)
{
    WhisperContext *wctx = ctx->priv;

    if (wctx->audio_buffer_fill_size > 0) {
        av_log(ctx, AV_LOG_INFO,
               "Remaining audio buffer %d samples (%d seconds) after stopping\n",
               wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
    }

    if (wctx->ctx_vad) {
        whisper_vad_free(wctx->ctx_vad);
        wctx->ctx_vad = NULL;
    }

    if (wctx->ctx_wsp) {
        whisper_free(wctx->ctx_wsp);
        wctx->ctx_wsp = NULL;
    }

    av_freep(&wctx->audio_buffer);

    if (wctx->avio_context)
        avio_closep(&wctx->avio_context);
}

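/*
 * Note on timestamps in run_transcription() below: whisper.cpp reports
 * segment times (whisper_full_get_segment_t0/t1) in units of 10 ms, so they
 * are multiplied by 10 to get milliseconds and then offset by
 * audio_buffer_start_ms, the position of the current buffer in the input.
 * The SRT branch splits such a millisecond value into HH:MM:SS,mmm with plain
 * integer arithmetic; e.g. 3723456 ms -> 3723456/3600000 = 1 h,
 * (3723456/60000)%60 = 2 min, (3723456/1000)%60 = 3 s, 3723456%1000 = 456 ms,
 * i.e. "01:02:03,456".
 */
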
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
{
    WhisperContext *wctx = ctx->priv;
    samples = FFMIN(samples, wctx->audio_buffer_fill_size);

    if (!wctx->ctx_wsp || samples == 0)
        return;

    const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
    const float duration = (float) samples / WHISPER_SAMPLE_RATE;

    av_log(ctx, AV_LOG_INFO,
           "run transcription at %"PRId64" ms, %d/%d samples (%.2f seconds)...\n",
           timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);

    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.language = wctx->language;
    params.n_threads = ff_filter_get_nb_threads(ctx);
    params.print_special = 0;
    params.print_progress = 0;
    params.print_realtime = 0;
    params.print_timestamps = 0;

    if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
        return;
    }

    const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
    char *segments_text = NULL;

    for (int i = 0; i < n_segments; ++i) {
        const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
        if (av_isspace(text[0]))
            text++;
        char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");

        if (av_strnlen(text_cleaned, 1) == 0) {
            av_freep(&text_cleaned);
            continue;
        }

        const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
        const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
        const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;

        av_log(ctx, AV_LOG_DEBUG, " [%"PRId64"-%"PRId64"%s]: \"%s\"\n",
               timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);

        if (segments_text) {
            char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
            av_freep(&segments_text);
            segments_text = new_text;
        } else
            segments_text = av_strdup(text_cleaned);

        if (wctx->avio_context) {
            const int64_t start_t = timestamp_ms + t0_ms;
            const int64_t end_t = timestamp_ms + t1_ms;
            char *buf = NULL;

            if (!av_strcasecmp(wctx->format, "srt")) {
                buf = av_asprintf("%d\n%02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64
                                  " --> %02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64"\n%s\n\n",
                                  wctx->index, start_t / 3600000,
                                  (start_t / 60000) % 60, (start_t / 1000) % 60,
                                  start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
                                  (end_t / 1000) % 60, end_t % 1000, text_cleaned);
            } else if (!av_strcasecmp(wctx->format, "json")) {
                buf = av_asprintf("{\"start\":%"PRId64",\"end\":%"PRId64",\"text\":\"%s\"}\n",
                                  start_t, end_t, text_cleaned);
            } else
                buf = av_strdup(text_cleaned);

            if (buf) {
                avio_write(wctx->avio_context, buf, strlen(buf));
                av_freep(&buf);
            }
        }

        av_freep(&text_cleaned);
    }

    wctx->index++;

    AVDictionary **metadata = &frame->metadata;
    if (metadata && segments_text) {
        av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
        char *duration_text = av_asprintf("%f", duration);
        av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
    }
    av_freep(&segments_text);

    if (wctx->audio_buffer_fill_size > samples) {
        // regions may overlap, so use memmove rather than memcpy
        memmove(wctx->audio_buffer, wctx->audio_buffer + samples,
                (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
        wctx->audio_buffer_start_ms += duration * 1000;
    }
    wctx->audio_buffer_fill_size -= samples;
}

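/*
 * filter_frame() below queues the incoming samples and decides when to run
 * the transcription: if a VAD model is loaded, the detector is only re-run
 * once at least vad_min_speech_duration + vad_min_silence_duration of new
 * audio has arrived since the previous VAD pass, and the queue is flushed up
 * to the end of the last detected speech segment, provided that segment ends
 * at least vad_min_silence_duration before the end of the buffer (so speech
 * that is still in progress is not cut). Without a VAD model the queue is
 * simply flushed whenever it reaches the configured queue size.
 */
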
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
{
    AVFilterContext *ctx = inlink->dst;
    WhisperContext *wctx = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    const int samples = frame->nb_samples;
    const float *input_data = (const float *) frame->data[0];

    // flush the queued audio if the incoming frame would not fit in the queue
    if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {
        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);
    }

    if (!wctx->audio_buffer_fill_size)
        wctx->audio_buffer_start_ms = av_rescale_q(frame->pts,
                                                   (AVRational) {1000, 1},
                                                   (AVRational) {inlink->time_base.den, inlink->time_base.num});
    memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
    wctx->audio_buffer_fill_size += samples;

    if (wctx->ctx_vad
        && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
        av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
        struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
                                                                                  wctx->vad_params,
                                                                                  wctx->audio_buffer,
                                                                                  wctx->audio_buffer_fill_size);
        wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;

        if (!segments) {
            av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
        } else {
            int n_segments = whisper_vad_segments_n_segments(segments);

            if (n_segments > 0) {
                const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
                const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
                int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);

                if (end_pos <= wctx->audio_buffer_fill_size -
                    av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
                    av_log(ctx, AV_LOG_INFO,
                           "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
                           n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
                    run_transcription(ctx, frame, end_pos);
                }
            }

            whisper_vad_free_segments(segments);
        }
    } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

    wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) {1, inlink->sample_rate}, inlink->time_base);
    return ff_filter_frame(outlink, frame);
}

static int push_last_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    WhisperContext *wctx = ctx->priv;
    AVFrame *frame;
    int n_out = 1;

    if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
        return 0;
    frame = ff_get_audio_buffer(outlink, n_out);
    if (!frame)
        return AVERROR(ENOMEM);

    av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);

    frame->pts = wctx->next_pts;
    if (wctx->next_pts != AV_NOPTS_VALUE)
        wctx->next_pts += av_rescale_q(n_out, (AVRational) {1, outlink->sample_rate}, outlink->time_base);

    run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

    return ff_filter_frame(outlink, frame);
}

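/*
 * activate() below follows the usual libavfilter activate() pattern:
 * forward status from the output link back to the input, consume and filter
 * any queued input frame, and on EOF flush the remaining queued audio through
 * a final 1-sample silent frame (push_last_frame) before propagating the EOF
 * status downstream; otherwise forward the frame request upstream.
 */
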
static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    WhisperContext *wctx = ctx->priv;
    int64_t pts;
    int status;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
        AVFrame *frame = NULL;
        int ret;

        ret = ff_inlink_consume_frame(inlink, &frame);
        if (ret < 0)
            return ret;
        if (ret > 0)
            return filter_frame(inlink, frame);
    }

    if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
        wctx->eof = status == AVERROR_EOF;

    if (wctx->eof) {
        push_last_frame(outlink);

        ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
        return 0;
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return FFERROR_NOT_READY;
}

static int query_formats(const AVFilterContext *ctx,
                         AVFilterFormatsConfig **cfg_in,
                         AVFilterFormatsConfig **cfg_out)
{
    static const enum AVSampleFormat sample_fmts[] = { AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE };
    AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
    int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
    int ret;

    ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts);
    if (ret < 0)
        return ret;

    ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);
    if (ret < 0)
        return ret;

    return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
}

#define OFFSET(x) offsetof(WhisperContext, x)
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
#define HOURS 3600000000

static const AVOption whisper_options[] = {
    { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
    { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
    { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
    { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
    { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
    { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
    { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"}, .flags = FLAGS },
    { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
    { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
    { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
    { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
    { NULL }
};
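
/*
 * The duration-typed options above (queue, vad_min_speech_duration,
 * vad_min_silence_duration) take the usual libavutil duration syntax, so
 * values like "queue=10" (seconds), "queue=500ms" or "queue=00:00:10" are
 * all accepted; the defaults correspond to 3 s, 100 ms and 500 ms.
 */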

static const AVClass whisper_class = {
    .class_name = "whisper",
    .item_name  = av_default_item_name,
    .option     = whisper_options,
    .version    = LIBAVUTIL_VERSION_INT,
};

const FFFilter ff_af_whisper = {
    .p.name        = "whisper",
    .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
    .p.priv_class  = &whisper_class,
    .p.flags       = AVFILTER_FLAG_METADATA_ONLY,
    .init          = init,
    .uninit        = uninit,
    .activate      = activate,
    .priv_size     = sizeof(WhisperContext),
    FILTER_INPUTS(ff_audio_default_filterpad),
    FILTER_OUTPUTS(ff_audio_default_filterpad),
    FILTER_QUERY_FUNC2(query_formats),
};
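
/*
 * Illustrative use of the filter (a sketch, not part of this file): it can be
 * instantiated through the normal libavfilter graph API, and the transcription
 * read back from the "lavfi.whisper.text" frame metadata set above. The model
 * file name below is only a placeholder.
 *
 *     AVFilterGraph *graph = avfilter_graph_alloc();
 *     AVFilterContext *whisper_ctx = NULL;
 *     int ret = avfilter_graph_create_filter(&whisper_ctx,
 *                                            avfilter_get_by_name("whisper"),
 *                                            "transcribe",
 *                                            "model=ggml-base.en.bin:language=en:"
 *                                            "queue=5:format=srt:destination=out.srt",
 *                                            NULL, graph);
 *
 *     // ... link abuffer/abuffersink filters, configure the graph, then for
 *     // each filtered AVFrame *frame:
 *     AVDictionaryEntry *e = av_dict_get(frame->metadata, "lavfi.whisper.text", NULL, 0);
 *     if (e)
 *         printf("%s\n", e->value);
 */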