Go to the documentation of this file.
78 case GGML_LOG_LEVEL_ERROR:
81 case GGML_LOG_LEVEL_WARN:
103 struct whisper_context_params params = whisper_context_default_params();
104 params.use_gpu = wctx->
use_gpu;
121 struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
127 wctx->
vad_params = whisper_vad_default_params();
140 if (!strcmp(
"-",
dst))
153 "Whisper filter initialized: model: %s lang: %s queue: %ld ms\n",
165 "Remaining audio buffer %d samples (%d seconds) after stopping\n",
170 whisper_vad_free(wctx->
ctx_vad);
197 "run transcription at %ld ms, %d/%d samples (%.2f seconds)...\n",
200 struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
203 params.print_special = 0;
204 params.print_progress = 0;
205 params.print_realtime = 0;
206 params.print_timestamps = 0;
213 const int n_segments = whisper_full_n_segments(wctx->
ctx_wsp);
214 char *segments_text =
NULL;
216 for (
int i = 0;
i < n_segments; ++
i) {
217 const char *text = whisper_full_get_segment_text(wctx->
ctx_wsp,
i);
227 const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->
ctx_wsp,
i);
228 const int64_t t0_ms = whisper_full_get_segment_t0(wctx->
ctx_wsp,
i) * 10;
229 const int64_t t1_ms = whisper_full_get_segment_t1(wctx->
ctx_wsp,
i) * 10;
232 timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ?
" (turn)" :
"", text_cleaned);
235 char *new_text =
av_asprintf(
"%s%s", segments_text, text_cleaned);
237 segments_text = new_text;
242 const int64_t start_t = timestamp_ms + t0_ms;
243 const int64_t end_t = timestamp_ms + t1_ms;
249 (
"%d\n%02ld:%02ld:%02ld,%03ld --> %02ld:%02ld:%02ld,%03ld\n%s\n\n",
250 wctx->
index, start_t / 3600000,
251 (start_t / 60000) % 60, (start_t / 1000) % 60,
252 start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
253 (end_t / 1000) % 60, end_t % 1000, text_cleaned);
255 buf =
av_asprintf(
"{\"start\":%ld,\"end\":%ld,\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);
303 (
AVRational) {inlink->time_base.den, inlink->time_base.num});
310 struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->
ctx_vad,
319 int n_segments = whisper_vad_segments_n_segments(segments);
321 if (n_segments > 0) {
322 const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
323 const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
324 int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
326 if (end_pos <= wctx->audio_buffer_fill_size -
329 "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
335 whisper_vad_free_segments(segments);
428 #define OFFSET(x) offsetof(WhisperContext, x)
429 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
430 #define HOURS 3600000000
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
#define AV_LOG_WARNING
Something somehow does not look correct.
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
static const AVOption whisper_options[]
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
static enum AVSampleFormat sample_fmts[]
#define AVERROR_EOF
End of file.
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
char * av_asprintf(const char *fmt,...)
int av_strcasecmp(const char *a, const char *b)
Locale-independent case-insensitive compare.
static av_const int av_isspace(int c)
Locale-independent conversion of ASCII isspace.
#define FILTER_INPUTS(array)
static const int sample_rates[]
This structure describes decoded (raw) audio or video data.
int audio_buffer_vad_size
int av_samples_set_silence(uint8_t *const *audio_data, int offset, int nb_samples, int nb_channels, enum AVSampleFormat sample_fmt)
Fill an audio buffer with silence.
int avio_open(AVIOContext **s, const char *filename, int flags)
Create and initialize an AVIOContext for accessing the resource indicated by url.
@ AV_OPT_TYPE_DURATION
Underlying C type is int64_t.
int audio_buffer_queue_size
const char * name
Filter name.
A link between two filters.
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
AVIOContext * avio_context
int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)
Take a frame from the link's FIFO and update the link's stats.
static const AVClass whisper_class
const FFFilter ff_af_whisper
static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
#define AV_DICT_DONT_STRDUP_VAL
Take ownership of a value that's been allocated with av_malloc() or another memory allocation function.
static int ff_thread_once(char *control, void(*routine)(void))
static int init(AVFilterContext *ctx)
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
int64_t audio_buffer_start_ms
static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)
Set the status field of a link from the source filter.
#define AVIO_FLAG_WRITE
write-only
#define AV_LOG_DEBUG
Stuff which is only useful for libav* developers.
int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)
Rescale a 64-bit integer by 2 rational numbers.
#define FILTER_OUTPUTS(array)
#define LIBAVUTIL_VERSION_INT
Describe the class of an AVClass context structure.
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
New swscale design to change SwsGraph is what coordinates multiple passes These can include cascaded scaling error diffusion and so on Or we could have separate passes for the vertical and horizontal scaling In between each SwsPass lies a fully allocated image buffer Graph passes may have different levels of e g we can have a single threaded error diffusion pass following a multi threaded scaling pass SwsGraph is internally recreated whenever the image format
Rational number (pair of numerator and denominator).
char * av_strireplace(const char *str, const char *from, const char *to)
Locale-independent strings replace.
static size_t av_strnlen(const char *s, size_t len)
Get the count of continuous non zero chars starting from the beginning.
const char * av_default_item_name(void *ptr)
Return the context name.
int audio_buffer_fill_size
const AVFilterPad ff_audio_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.
int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)
Test and acknowledge the change of status on the link.
size_t ff_inlink_queued_frames(AVFilterLink *link)
Get the number of frames available on the link.
static int activate(AVFilterContext *ctx)
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
An AVChannelLayout holds information about the channel layout of audio data.
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
#define av_err2str(errnum)
Convenience macro, the return value should be used only directly in function arguments but never stand-alone.
#define AV_NOPTS_VALUE
Undefined timestamp value.
static int FUNC() user_data(CodedBitstreamContext *ctx, RWContext *rw, MPEG2RawUserData *current)
AVFilterContext * src
source filter
void avio_write(AVIOContext *s, const unsigned char *buf, int size)
FF_FILTER_FORWARD_WANTED(outlink, inlink)
int64_t vad_min_silence_duration
static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)
#define AV_LOG_INFO
Standard information.
@ AV_OPT_TYPE_FLOAT
Underlying C type is float.
int sample_rate
samples per second
int direct
avio_read and avio_write should if possible be satisfied directly instead of going through a buffer, and avio_seek will always call the underlying seek function directly.
#define i(width, name, range_min, range_max)
#define AV_TIME_BASE
Internal time base represented as integer.
#define av_malloc_array(a, b)
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
AVSampleFormat
Audio sample formats.
#define FILTER_QUERY_FUNC2(func)
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
static void uninit(AVFilterContext *ctx)
Undefined Behavior In the C language
static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
const char * class_name
The name of the class; usually it is the same name as the context structure type to which the AVClass is associated.
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame
static int push_last_frame(AVFilterLink *outlink)
struct whisper_vad_context * ctx_vad
@ AV_OPT_TYPE_INT
Underlying C type is int.
struct whisper_context * ctx_wsp
#define AVFILTER_FLAG_METADATA_ONLY
The filter is a "metadata" filter - it does not modify the frame data in any way.
Filter the word “frame” indicates either a video frame or a group of audio samples
AVRational time_base
Define the time base used by the PTS of the frames/samples which will pass through this link.
#define AVIO_FLAG_DIRECT
Use direct mode.
char * av_strdup(const char *s)
Duplicate a string.
AVFilter p
The public AVFilter.
@ AV_OPT_TYPE_BOOL
Underlying C type is int.
int avio_closep(AVIOContext **s)
Close the resource accessed by the AVIOContext *s, free it and set the pointer pointing to it to NULL.
int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags)
Set the given entry in *pm, overwriting an existing entry.
int64_t vad_min_speech_duration
struct whisper_vad_params vad_params
@ AV_OPT_TYPE_STRING
Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_malloc() family of functions.
static void input_data(MLPEncodeContext *ctx, MLPSubstream *s, uint8_t **const samples, int nb_samples)
Wrapper function for inputting data in two different bit-depths.