FFmpeg
af_whisper.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025 Vittorio Palmisano
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public License
8  * as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#include <whisper.h>

#include "libavutil/avstring.h"
#include "libavutil/avutil.h"
#include "libavutil/channel_layout.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "libavutil/thread.h"
#include "libavformat/avio.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"

#include "filters.h"
#include "formats.h"

41 typedef struct WhisperContext {
42  const AVClass *class;
43  char *model_path;
44  const char *language;
45  char *language_str;
46  bool translate;
47  bool use_gpu;
53 
55  char *destination;
56  char *format;
57  int max_len;
58 
59  struct whisper_context *ctx_wsp;
60  struct whisper_vad_context *ctx_vad;
61  struct whisper_vad_params vad_params;
62 
63  float *audio_buffer;
68 
69  int eof;
71 
73  int index;
75 
76 static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
77 {
80  switch (level) {
81  case GGML_LOG_LEVEL_ERROR:
83  break;
84  case GGML_LOG_LEVEL_WARN:
86  break;
87  }
88  av_log(ctx, av_log_level, "%s", text);
89 }
90 
91 static int init(AVFilterContext *ctx)
92 {
93  WhisperContext *wctx = ctx->priv;
94 
95  static AVOnce init_static_once = AV_ONCE_INIT;
96  ff_thread_once(&init_static_once, ggml_backend_load_all);
97 
98  whisper_log_set(cb_log, ctx);
99 
100  // Init whisper context
101  if (!wctx->model_path) {
102  av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
103  return AVERROR(EINVAL);
104  }
105 
106  struct whisper_context_params params = whisper_context_default_params();
107  params.use_gpu = wctx->use_gpu;
108  params.gpu_device = wctx->gpu_device;
109 
110  wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
111  if (wctx->ctx_wsp == NULL) {
112  av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
113  return AVERROR(EIO);
114  }
115 
116  // Init buffer
117  wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
118  wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
119  if (!wctx->audio_buffer)
120  return AVERROR(ENOMEM);
121 
122  // Init VAD model context
123  if (wctx->vad_model_path) {
124  struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
125  ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
126  // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
127  ctx_params.gpu_device = wctx->gpu_device;
128  wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);
129 
130  wctx->vad_params = whisper_vad_default_params();
131  wctx->vad_params.threshold = wctx->vad_threshold;
132  wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
133  wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
134  wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
135  wctx->vad_params.speech_pad_ms = 0;
136  wctx->vad_params.samples_overlap = 0;
137  }
138 
139  wctx->next_pts = AV_NOPTS_VALUE;
140 
141  if (wctx->destination && strcmp("", wctx->destination)) {
142  const char *dst = wctx->destination;
143  if (!strcmp("-", dst))
144  dst = "pipe:1";
145  int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
146 
147  if (ret < 0) {
148  av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
149  return ret;
150  }
151 
153  }
154 
155  if (!whisper_is_multilingual(wctx->ctx_wsp)) {
156  if (!wctx->translate && strcmp(wctx->language_str, "auto") == 0) {
158  "Multilingual model not provided. Non-English audio may not be correctly transcribed.\n");
159  } else if (wctx->translate || (strcmp(wctx->language_str, "auto") != 0 && strcmp(wctx->language_str, "en") != 0)) {
161  "%s requested but multilingual model not provided.\n", wctx->translate ? "Translation" : "Transcription");
162  return AVERROR(ENOSYS);
163  }
164  wctx->language = "en";
165  } else
166  wctx->language = wctx->language_str;
167 
169  "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
170  wctx->model_path, wctx->language, wctx->queue / 1000);
171 
172  return 0;
173 }
174 
176 {
177  WhisperContext *wctx = ctx->priv;
178 
179  if (wctx->audio_buffer_fill_size > 0) {
181  "Remaining audio buffer %d samples (%d seconds) after stopping\n",
182  wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
183  }
184 
185  if (wctx->ctx_vad) {
186  whisper_vad_free(wctx->ctx_vad);
187  wctx->ctx_vad = NULL;
188  }
189 
190  if (wctx->ctx_wsp) {
191  whisper_free(wctx->ctx_wsp);
192  wctx->ctx_wsp = NULL;
193  }
194 
195  av_freep(&wctx->audio_buffer);
196 
197  if (wctx->avio_context)
198  avio_closep(&wctx->avio_context);
199 }
200 
202 {
203  WhisperContext *wctx = ctx->priv;
205 
206  if (!wctx->ctx_wsp || samples == 0)
207  return;
208 
209  const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
210  const float duration = (float) samples / WHISPER_SAMPLE_RATE;
211 
213  "run transcription at %" PRId64 " ms, %d/%d samples (%.2f seconds)...\n",
214  timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);
215 
216  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
217  params.language = wctx->language;
218  params.translate = wctx->translate;
219  params.n_threads = ff_filter_get_nb_threads(ctx);
220  params.print_special = 0;
221  params.print_progress = 0;
222  params.print_realtime = 0;
223  params.print_timestamps = 0;
224  params.max_len = wctx->max_len;
225  params.token_timestamps = (wctx->max_len > 0);
226  params.split_on_word = (wctx->max_len > 0);
227 
228  if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
229  av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
230  return;
231  }
232 
233  const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
234  char *segments_text = NULL;
235 
236  for (int i = 0; i < n_segments; ++i) {
237  const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
238  if (av_isspace(text[0]))
239  text++;
240  char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");
241 
242  if (av_strnlen(text_cleaned, 1) == 0) {
243  av_freep(&text_cleaned);
244  continue;
245  }
246 
247  // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them
248  if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 ||
249  strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 ||
250  strcmp(text_cleaned, "AUDIO") == 0)) {
251  av_freep(&text_cleaned);
252  continue;
253  }
254 
255  const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
256  const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
257  const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
258 
259  av_log(ctx, AV_LOG_DEBUG, " [%" PRId64 "-%" PRId64 "%s]: \"%s\"\n",
260  timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);
261 
262  if (segments_text) {
263  char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
264  av_freep(&segments_text);
265  segments_text = new_text;
266  } else
267  segments_text = av_strdup(text_cleaned);
268 
269  if (wctx->avio_context) {
270  const int64_t start_t = timestamp_ms + t0_ms;
271  const int64_t end_t = timestamp_ms + t1_ms;
272  char *buf = NULL;
273 
274  if (!av_strcasecmp(wctx->format, "srt")) {
275  buf =
277  ("%d\n%02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 " --> %02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 "\n%s\n\n",
278  wctx->index, start_t / 3600000,
279  (start_t / 60000) % 60, (start_t / 1000) % 60,
280  start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
281  (end_t / 1000) % 60, end_t % 1000, text_cleaned);
282 
283  wctx->index++;
284  } else if (!av_strcasecmp(wctx->format, "json")) {
285  buf = av_asprintf("{\"start\":%" PRId64 ",\"end\":%" PRId64 ",\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);
286  } else
287  buf = av_asprintf("%s\n", text_cleaned);
288 
289  if (buf) {
290  avio_write(wctx->avio_context, buf, strlen(buf));
291  av_freep(&buf);
292  }
293  }
294 
295  av_freep(&text_cleaned);
296  }
297 
298  AVDictionary **metadata = &frame->metadata;
299  if (metadata && segments_text) {
300  av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
301  char *duration_text = av_asprintf("%f", duration);
302  av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
303  }
304  av_freep(&segments_text);
305 
306  if (wctx->audio_buffer_fill_size > samples) {
307  memcpy(wctx->audio_buffer, wctx->audio_buffer + samples,
308  (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
309  wctx->audio_buffer_start_ms += duration * 1000;
310  }
313 }
314 
316 {
317  AVFilterContext *ctx = inlink->dst;
318  WhisperContext *wctx = ctx->priv;
319  AVFilterLink *outlink = ctx->outputs[0];
320 
321  const int samples = frame->nb_samples;
322  const float *input_data = (const float *) frame->data[0];
323 
326  }
327 
328  if (!wctx->audio_buffer_fill_size)
330  (AVRational) {1000, 1},
331  (AVRational) {inlink->time_base.den, inlink->time_base.num});
332  memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
334 
335  if (wctx->ctx_vad
336  && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
337  av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
338  struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
339  wctx->vad_params,
340  wctx->audio_buffer,
341  wctx->audio_buffer_fill_size);
343 
344  if (!segments) {
345  av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
346  } else {
347  int n_segments = whisper_vad_segments_n_segments(segments);
348 
349  if (n_segments > 0) {
350  const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
351  const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
352  int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
353 
354  if (end_pos <= wctx->audio_buffer_fill_size -
355  av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
357  "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
358  n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
359  run_transcription(ctx, frame, end_pos);
360  }
361  }
362 
363  whisper_vad_free_segments(segments);
364  }
365  } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
367 
368  wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) {
369  1, inlink->sample_rate}
370  , inlink->time_base);
371  return ff_filter_frame(outlink, frame);
372 }
373 
374 static int push_last_frame(AVFilterLink *outlink)
375 {
376  AVFilterContext *ctx = outlink->src;
377  WhisperContext *wctx = ctx->priv;
378  AVFrame *frame;
379  int n_out = 1;
380 
381  if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
382  return 0;
383  frame = ff_get_audio_buffer(outlink, n_out);
384  if (!frame)
385  return AVERROR(ENOMEM);
386 
387  av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);
388 
389  frame->pts = wctx->next_pts;
390  if (wctx->next_pts != AV_NOPTS_VALUE)
391  wctx->next_pts += av_rescale_q(n_out, (AVRational) {
392  1, outlink->sample_rate}
393  , outlink->time_base);
394 
396 
397  return ff_filter_frame(outlink, frame);
398 }
399 
401 {
402  AVFilterLink *inlink = ctx->inputs[0];
403  AVFilterLink *outlink = ctx->outputs[0];
404  WhisperContext *wctx = ctx->priv;
405  int64_t pts;
406  int status;
407 
409 
410  if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
411  AVFrame *frame = NULL;
412  int ret;
413 
415  if (ret < 0)
416  return ret;
417  if (ret > 0)
418  return filter_frame(inlink, frame);
419  }
420 
421  if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
422  wctx->eof = status == AVERROR_EOF;
423 
424  if (wctx->eof) {
425  push_last_frame(outlink);
426 
427  ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
428  return 0;
429  }
430 
432 
433  return FFERROR_NOT_READY;
434 }
435 
437  AVFilterFormatsConfig **cfg_in,
438  AVFilterFormatsConfig **cfg_out)
439 {
441  AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
442  int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
443  int ret;
444 
446  if (ret < 0)
447  return ret;
448 
449  ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);
450  if (ret < 0)
451  return ret;
452 
453  return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
454 }
455 
#define OFFSET(x) offsetof(WhisperContext, x)
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
/* one hour in AV_TIME_BASE (microsecond) units; upper bound for duration options */
#define HOURS 3600000000
459 
460 static const AVOption whisper_options[] = {
461  { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
462  { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language_str), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
463  { "translate", "Translate from source language to English", OFFSET(translate), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, .flags = FLAGS },
464  { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
465  { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
466  { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
467  { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
468  { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
469  { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
470  { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
471  { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
472  { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
473  { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
474  { NULL }
475 };
476 
477 static const AVClass whisper_class = {
478  .class_name = "whisper",
479  .item_name = av_default_item_name,
480  .option = whisper_options,
481  .version = LIBAVUTIL_VERSION_INT,
482 };
483 
485  .p.name = "whisper",
486  .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
487  .p.priv_class = &whisper_class,
488  .p.flags = AVFILTER_FLAG_METADATA_ONLY,
489  .init = init,
490  .uninit = uninit,
491  .activate = activate,
492  .priv_size = sizeof(WhisperContext),
496 };
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:74
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:216
level
uint8_t level
Definition: svq3.c:208
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
whisper_options
static const AVOption whisper_options[]
Definition: af_whisper.c:460
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1067
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:931
thread.h
AVERROR_EOF
#define AVERROR_EOF
End of file.
Definition: error.h:57
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
int64_t
long long int64_t
Definition: coverity.c:34
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
av_asprintf
char * av_asprintf(const char *fmt,...)
Definition: avstring.c:115
av_strcasecmp
int av_strcasecmp(const char *a, const char *b)
Locale-independent case-insensitive compare.
Definition: avstring.c:208
av_isspace
static av_const int av_isspace(int c)
Locale-independent conversion of ASCII isspace.
Definition: avstring.h:218
WhisperContext::language
const char * language
Definition: af_whisper.c:44
sample_rates
static const int sample_rates[]
Definition: dcaenc.h:34
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:434
WhisperContext::audio_buffer_vad_size
int audio_buffer_vad_size
Definition: af_whisper.c:66
av_samples_set_silence
int av_samples_set_silence(uint8_t *const *audio_data, int offset, int nb_samples, int nb_channels, enum AVSampleFormat sample_fmt)
Fill an audio buffer with silence.
Definition: samplefmt.c:246
AVOption
AVOption.
Definition: opt.h:429
avio_open
int avio_open(AVIOContext **s, const char *filename, int flags)
Create and initialize a AVIOContext for accessing the resource indicated by url.
Definition: avio.c:498
AV_OPT_TYPE_DURATION
@ AV_OPT_TYPE_DURATION
Underlying C type is int64_t.
Definition: opt.h:319
WhisperContext
Definition: af_whisper.c:41
ff_set_common_channel_layouts_from_list2
int ff_set_common_channel_layouts_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const AVChannelLayout *fmts)
Definition: formats.c:1025
WhisperContext::audio_buffer_queue_size
int audio_buffer_queue_size
Definition: af_whisper.c:64
WhisperContext::use_gpu
bool use_gpu
Definition: af_whisper.c:47
AVDictionary
Definition: dict.c:32
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:220
WhisperContext::avio_context
AVIOContext * avio_context
Definition: af_whisper.c:72
formats.h
ff_inlink_consume_frame
int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)
Take a frame from the link's FIFO and update the link's stats.
Definition: avfilter.c:1515
whisper_class
static const AVClass whisper_class
Definition: af_whisper.c:477
ff_af_whisper
const FFFilter ff_af_whisper
Definition: af_whisper.c:484
samplefmt.h
run_transcription
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
Definition: af_whisper.c:201
WhisperContext::vad_threshold
float vad_threshold
Definition: af_whisper.c:50
pts
static int64_t pts
Definition: transcode_aac.c:644
FILTER_QUERY_FUNC2
#define FILTER_QUERY_FUNC2(func)
Definition: filters.h:241
AV_DICT_DONT_STRDUP_VAL
#define AV_DICT_DONT_STRDUP_VAL
Take ownership of a value that's been allocated with av_malloc() or another memory allocation functio...
Definition: dict.h:79
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:205
init
static int init(AVFilterContext *ctx)
Definition: af_whisper.c:91
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:210
WhisperContext::gpu_device
int gpu_device
Definition: af_whisper.c:48
FFFilter
Definition: filters.h:267
WhisperContext::audio_buffer_start_ms
int64_t audio_buffer_start_ms
Definition: af_whisper.c:67
float
float
Definition: af_crystalizer.c:122
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:265
ff_outlink_set_status
static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)
Set the status field of a link from the source filter.
Definition: filters.h:629
AVIO_FLAG_WRITE
#define AVIO_FLAG_WRITE
write-only
Definition: avio.h:618
ff_set_common_samplerates_from_list2
int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)
Definition: formats.c:1049
AV_LOG_DEBUG
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
Definition: log.h:231
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
av_rescale_q
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)
Rescale a 64-bit integer by 2 rational numbers.
Definition: mathematics.c:142
WhisperContext::max_len
int max_len
Definition: af_whisper.c:57
if
if(ret)
Definition: filter_design.txt:179
LIBAVUTIL_VERSION_INT
#define LIBAVUTIL_VERSION_INT
Definition: version.h:85
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:203
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:76
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
Definition: af_whisper.c:315
metadata
Stream codec metadata
Definition: ogg-flac-chained-meta.txt:2
NULL
#define NULL
Definition: coverity.c:32
format
New swscale design to change SwsGraph is what coordinates multiple passes These can include cascaded scaling error diffusion and so on Or we could have separate passes for the vertical and horizontal scaling In between each SwsPass lies a fully allocated image buffer Graph passes may have different levels of e g we can have a single threaded error diffusion pass following a multi threaded scaling pass SwsGraph is internally recreated whenever the image format
Definition: swscale-v2.txt:14
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
av_strireplace
char * av_strireplace(const char *str, const char *from, const char *to)
Locale-independent strings replace.
Definition: avstring.c:230
av_strnlen
size_t static size_t av_strnlen(const char *s, size_t len)
Get the count of continuous non zero chars starting from the beginning.
Definition: avstring.h:141
av_default_item_name
const char * av_default_item_name(void *ptr)
Return the context name.
Definition: log.c:242
WhisperContext::audio_buffer_fill_size
int audio_buffer_fill_size
Definition: af_whisper.c:65
WhisperContext::vad_model_path
char * vad_model_path
Definition: af_whisper.c:49
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:34
HOURS
#define HOURS
Definition: af_whisper.c:458
ff_inlink_acknowledge_status
int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)
Test and acknowledge the change of status on the link.
Definition: avfilter.c:1462
AVOnce
#define AVOnce
Definition: thread.h:202
AVFilterFormatsConfig
Lists of formats / etc.
Definition: avfilter.h:121
WhisperContext::model_path
char * model_path
Definition: af_whisper.c:43
ff_inlink_queued_frames
size_t ff_inlink_queued_frames(AVFilterLink *link)
Get the number of frames available on the link.
Definition: avfilter.c:1478
av_log_level
static atomic_int av_log_level
Definition: log.c:59
WhisperContext::format
char * format
Definition: af_whisper.c:56
AVIOContext
Bytestream IO Context.
Definition: avio.h:160
activate
static int activate(AVFilterContext *ctx)
Definition: af_whisper.c:400
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
AVChannelLayout
An AVChannelLayout holds information about the channel layout of audio data.
Definition: channel_layout.h:319
WhisperContext::index
int index
Definition: af_whisper.c:73
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
av_err2str
#define av_err2str(errnum)
Convenience macro, the return value should be used only directly in function arguments but never stan...
Definition: error.h:122
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
avio.h
WhisperContext::eof
int eof
Definition: af_whisper.c:69
AV_NOPTS_VALUE
#define AV_NOPTS_VALUE
Undefined timestamp value.
Definition: avutil.h:247
user_data
static int FUNC() user_data(CodedBitstreamContext *ctx, RWContext *rw, MPEG2RawUserData *current)
Definition: cbs_mpeg2_syntax_template.c:59
avio_write
void avio_write(AVIOContext *s, const unsigned char *buf, int size)
Definition: aviobuf.c:206
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
WhisperContext::vad_min_silence_duration
int64_t vad_min_silence_duration
Definition: af_whisper.c:52
query_formats
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
Definition: af_whisper.c:436
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:221
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
Definition: opt.h:271
AVIOContext::direct
int direct
avio_read and avio_write should if possible be satisfied directly instead of going through a buffer,...
Definition: avio.h:268
internal.h
AV_TIME_BASE
#define AV_TIME_BASE
Internal time base represented as integer.
Definition: avutil.h:253
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:32
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:845
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
WhisperContext::language_str
char * language_str
Definition: af_whisper.c:45
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
OFFSET
#define OFFSET(x)
Definition: af_whisper.c:456
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
uninit
static void uninit(AVFilterContext *ctx)
Definition: af_whisper.c:175
cb_log
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
Definition: af_whisper.c:76
ret
ret
Definition: filter_design.txt:187
AVClass::class_name
const char * class_name
The name of the class; usually it is the same name as the context structure type to which the AVClass...
Definition: log.h:81
frame
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
Definition: filter_design.txt:265
WhisperContext::audio_buffer
float * audio_buffer
Definition: af_whisper.c:63
FF_COUNT2LAYOUT
#define FF_COUNT2LAYOUT(c)
Encode a channel count as a channel layout.
Definition: formats.h:102
ff_set_sample_formats_from_list2
int ff_set_sample_formats_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const enum AVSampleFormat *fmts)
Definition: formats.c:1153
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:264
push_last_frame
static int push_last_frame(AVFilterLink *outlink)
Definition: af_whisper.c:374
status
ov_status_e status
Definition: dnn_backend_openvino.c:100
channel_layout.h
WhisperContext::ctx_vad
struct whisper_vad_context * ctx_vad
Definition: af_whisper.c:60
AV_OPT_TYPE_INT
@ AV_OPT_TYPE_INT
Underlying C type is int.
Definition: opt.h:259
avfilter.h
WhisperContext::ctx_wsp
struct whisper_context * ctx_wsp
Definition: af_whisper.c:59
AVFILTER_FLAG_METADATA_ONLY
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Definition: avfilter.h:183
WhisperContext::translate
bool translate
Definition: af_whisper.c:46
WhisperContext::destination
char * destination
Definition: af_whisper.c:55
samples
Filter the word “frame” indicates either a video frame or a group of audio samples
Definition: filter_design.txt:8
AVFilterContext
An instance of a filter.
Definition: avfilter.h:274
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:639
AVIO_FLAG_DIRECT
#define AVIO_FLAG_DIRECT
Use direct mode.
Definition: avio.h:644
WhisperContext::next_pts
int64_t next_pts
Definition: af_whisper.c:70
FFFilter::p
AVFilter p
The public AVFilter.
Definition: filters.h:271
FLAGS
#define FLAGS
Definition: af_whisper.c:457
avutil.h
mem.h
audio.h
av_strdup
#define av_strdup(s)
Definition: ops_asmgen.c:47
AV_OPT_TYPE_BOOL
@ AV_OPT_TYPE_BOOL
Underlying C type is int.
Definition: opt.h:327
avio_closep
int avio_closep(AVIOContext **s)
Close the resource accessed by the AVIOContext *s, free it and set the pointer pointing to it to NULL...
Definition: avio.c:650
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:35
av_dict_set
int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags)
Set the given entry in *pm, overwriting an existing entry.
Definition: dict.c:86
WhisperContext::vad_min_speech_duration
int64_t vad_min_speech_duration
Definition: af_whisper.c:51
WhisperContext::vad_params
struct whisper_vad_params vad_params
Definition: af_whisper.c:61
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
WhisperContext::queue
int64_t queue
Definition: af_whisper.c:54
avstring.h
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...
Definition: opt.h:276
input_data
static void input_data(MLPEncodeContext *ctx, MLPSubstream *s, uint8_t **const samples, int nb_samples)
Wrapper function for inputting data in two different bit-depths.
Definition: mlpenc.c:1219
AV_SAMPLE_FMT_FLT
@ AV_SAMPLE_FMT_FLT
float
Definition: samplefmt.h:60
duration
static int64_t duration
Definition: ffplay.c:329