FFmpeg
Data Structures | Macros | Functions | Variables
af_whisper.c File Reference
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <whisper.h>
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/channel_layout.h"
#include "libavutil/samplefmt.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/audio.h"
#include "libavutil/mem.h"
#include "libavutil/avstring.h"
#include "libavutil/internal.h"
#include "libavformat/avio.h"
#include "libavutil/thread.h"
#include "formats.h"

Go to the source code of this file.

Data Structures

struct  WhisperContext
 

Macros

#define OFFSET(x)   offsetof(WhisperContext, x)
 
#define FLAGS   AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
 
#define HOURS   3600000000
 

Functions

static void cb_log (enum ggml_log_level level, const char *text, void *user_data)
 
static int init (AVFilterContext *ctx)
 
static void uninit (AVFilterContext *ctx)
 
static void run_transcription (AVFilterContext *ctx, AVFrame *frame, int samples)
 
static int filter_frame (AVFilterLink *inlink, AVFrame *frame)
 
static int push_last_frame (AVFilterLink *outlink)
 
static int activate (AVFilterContext *ctx)
 
static int query_formats (const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
 

Variables

static const AVOption whisper_options []
 
static const AVClass whisper_class
 
const FFFilter ff_af_whisper
 

Macro Definition Documentation

◆ OFFSET

#define OFFSET (   x)    offsetof(WhisperContext, x)

Definition at line 428 of file af_whisper.c.

◆ FLAGS

#define FLAGS   AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM

Definition at line 429 of file af_whisper.c.

◆ HOURS

#define HOURS   3600000000

Definition at line 430 of file af_whisper.c.

Function Documentation

◆ cb_log()

static void cb_log ( enum ggml_log_level  level,
const char *  text,
void *  user_data 
)
static

Definition at line 73 of file af_whisper.c.

Referenced by init().

◆ init()

static int init ( AVFilterContext *  ctx)
static

Definition at line 88 of file af_whisper.c.

◆ uninit()

static void uninit ( AVFilterContext *  ctx)
static

Definition at line 159 of file af_whisper.c.

◆ run_transcription()

static void run_transcription ( AVFilterContext *  ctx,
AVFrame *  frame,
int  samples 
)
static

Definition at line 185 of file af_whisper.c.

Referenced by filter_frame(), and push_last_frame().

◆ filter_frame()

static int filter_frame ( AVFilterLink *  inlink,
AVFrame *  frame 
)
static

Definition at line 287 of file af_whisper.c.

Referenced by activate().

◆ push_last_frame()

static int push_last_frame ( AVFilterLink *  outlink)
static

Definition at line 346 of file af_whisper.c.

Referenced by activate().

◆ activate()

static int activate ( AVFilterContext *  ctx)
static

Definition at line 372 of file af_whisper.c.

◆ query_formats()

static int query_formats ( const AVFilterContext *  ctx,
AVFilterFormatsConfig **  cfg_in,
AVFilterFormatsConfig **  cfg_out 
)
static

Definition at line 408 of file af_whisper.c.

Variable Documentation

◆ whisper_options

const AVOption whisper_options[]
static
Initial value:
= {
{ "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
{ "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
{ "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
{ "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
{ "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
{ "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
{ "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
{ "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
{ "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
{ "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
{ "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
{ NULL }
}

Definition at line 432 of file af_whisper.c.

◆ whisper_class

const AVClass whisper_class
static
Initial value:
= {
.class_name = "whisper",
.item_name = av_default_item_name,
.option = whisper_options,
}

Definition at line 447 of file af_whisper.c.

◆ ff_af_whisper

const FFFilter ff_af_whisper
Initial value:
= {
.p.name = "whisper",
.p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
.p.priv_class = &whisper_class,
.init = init,
.uninit = uninit,
.activate = activate,
.priv_size = sizeof(WhisperContext),
}

Definition at line 454 of file af_whisper.c.

whisper_options
static const AVOption whisper_options[]
Definition: af_whisper.c:432
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: filters.h:263
AV_OPT_TYPE_DURATION
@ AV_OPT_TYPE_DURATION
Underlying C type is int64_t.
Definition: opt.h:319
WhisperContext
Definition: af_whisper.c:41
whisper_class
static const AVClass whisper_class
Definition: af_whisper.c:447
init
static int init(AVFilterContext *ctx)
Definition: af_whisper.c:88
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: filters.h:264
LIBAVUTIL_VERSION_INT
#define LIBAVUTIL_VERSION_INT
Definition: version.h:85
NULL
#define NULL
Definition: coverity.c:32
format
New swscale design to change SwsGraph is what coordinates multiple passes. These can include cascaded scaling, error diffusion and so on. Or we could have separate passes for the vertical and horizontal scaling. In between each SwsPass lies a fully allocated image buffer. Graph passes may have different levels of threading, e.g. we can have a single threaded error diffusion pass following a multi-threaded scaling pass. SwsGraph is internally recreated whenever the image format
Definition: swscale-v2.txt:14
av_default_item_name
const char * av_default_item_name(void *ptr)
Return the context name.
Definition: log.c:241
ff_audio_default_filterpad
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
Definition: audio.c:34
HOURS
#define HOURS
Definition: af_whisper.c:430
activate
static int activate(AVFilterContext *ctx)
Definition: af_whisper.c:372
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:94
query_formats
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
Definition: af_whisper.c:408
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
Definition: opt.h:271
FILTER_QUERY_FUNC2
#define FILTER_QUERY_FUNC2(func)
Definition: filters.h:240
OFFSET
#define OFFSET(x)
Definition: af_whisper.c:428
uninit
static void uninit(AVFilterContext *ctx)
Definition: af_whisper.c:159
language
Undefined Behavior In the C language
Definition: undefined.txt:3
AV_OPT_TYPE_INT
@ AV_OPT_TYPE_INT
Underlying C type is int.
Definition: opt.h:259
AVFILTER_FLAG_METADATA_ONLY
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Definition: avfilter.h:183
FLAGS
#define FLAGS
Definition: af_whisper.c:429
AV_OPT_TYPE_BOOL
@ AV_OPT_TYPE_BOOL
Underlying C type is int.
Definition: opt.h:327
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...
Definition: opt.h:276