Documentation ¶
Overview ¶
Speech recognition with Next-gen Kaldi.
sherpa-onnx is an open-source speech recognition framework for Next-gen Kaldi. It depends only on onnxruntime, supporting both streaming and non-streaming speech recognition.
It does not need to access the network during recognition and everything runs locally.
It supports a variety of platforms, such as Linux (x86_64, aarch64, arm), Windows (x86_64, x86), macOS (x86_64, arm64), etc.
Usage examples:
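Real-time speech recognition from a microphone
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/real-time-speech-recognition-from-microphone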
Decode files using a non-streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-decode-files
Decode files using a streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files
Convert text to speech using a non-streaming model
Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts
Index ¶
- func DeleteCircularBuffer(buffer *CircularBuffer)
- func DeleteKeywordSpotter(spotter *KeywordSpotter)
- func DeleteOfflinePunc(punc *OfflinePunctuation)
- func DeleteOfflineRecognizer(recognizer *OfflineRecognizer)
- func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization)
- func DeleteOfflineStream(stream *OfflineStream)
- func DeleteOfflineTts(tts *OfflineTts)
- func DeleteOnlineRecognizer(recognizer *OnlineRecognizer)
- func DeleteOnlineStream(stream *OnlineStream)
- func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor)
- func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager)
- func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification)
- func DeleteVoiceActivityDetector(vad *VoiceActivityDetector)
- type CircularBuffer
- type FastClusteringConfig
- type FeatureConfig
- type GeneratedAudio
- type KeywordSpotter
- type KeywordSpotterConfig
- type KeywordSpotterResult
- type OfflineLMConfig
- type OfflineModelConfig
- type OfflineMoonshineModelConfig
- type OfflineNemoEncDecCtcModelConfig
- type OfflineParaformerModelConfig
- type OfflinePunctuation
- type OfflinePunctuationConfig
- type OfflinePunctuationModelConfig
- type OfflineRecognizer
- type OfflineRecognizerConfig
- type OfflineRecognizerResult
- type OfflineSenseVoiceModelConfig
- type OfflineSpeakerDiarization
- type OfflineSpeakerDiarizationConfig
- type OfflineSpeakerDiarizationSegment
- type OfflineSpeakerSegmentationModelConfig
- type OfflineSpeakerSegmentationPyannoteModelConfig
- type OfflineStream
- type OfflineTdnnModelConfig
- type OfflineTransducerModelConfig
- type OfflineTts
- type OfflineTtsConfig
- type OfflineTtsMatchaModelConfig
- type OfflineTtsModelConfig
- type OfflineTtsVitsModelConfig
- type OfflineWhisperModelConfig
- type OnlineCtcFstDecoderConfig
- type OnlineModelConfig
- type OnlineParaformerModelConfig
- type OnlineRecognizer
- func (recognizer *OnlineRecognizer) Decode(s *OnlineStream)
- func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream)
- func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult
- func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool
- func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool
- func (recognizer *OnlineRecognizer) Reset(s *OnlineStream)
- type OnlineRecognizerConfig
- type OnlineRecognizerResult
- type OnlineStream
- type OnlineTransducerModelConfig
- type OnlineZipformer2CtcModelConfig
- type SileroVadModelConfig
- type SpeakerEmbeddingExtractor
- type SpeakerEmbeddingExtractorConfig
- type SpeakerEmbeddingManager
- func (m *SpeakerEmbeddingManager) AllSpeakers() []string
- func (m *SpeakerEmbeddingManager) Contains(name string) bool
- func (m *SpeakerEmbeddingManager) NumSpeakers() int
- func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool
- func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool
- func (m *SpeakerEmbeddingManager) Remove(name string) bool
- func (m *SpeakerEmbeddingManager) Search(embedding []float32, threshold float32) string
- func (m *SpeakerEmbeddingManager) Verify(name string, embedding []float32, threshold float32) bool
- type SpeechSegment
- type SpokenLanguageIdentification
- type SpokenLanguageIdentificationConfig
- type SpokenLanguageIdentificationResult
- type SpokenLanguageIdentificationWhisperConfig
- type VadModelConfig
- type VoiceActivityDetector
- func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32)
- func (vad *VoiceActivityDetector) Clear()
- func (vad *VoiceActivityDetector) Flush()
- func (vad *VoiceActivityDetector) Front() *SpeechSegment
- func (vad *VoiceActivityDetector) IsEmpty() bool
- func (vad *VoiceActivityDetector) IsSpeech() bool
- func (vad *VoiceActivityDetector) Pop()
- func (vad *VoiceActivityDetector) Reset()
- type Wave
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DeleteCircularBuffer ¶ added in v1.9.15
func DeleteCircularBuffer(buffer *CircularBuffer)
func DeleteKeywordSpotter ¶ added in v1.10.37
func DeleteKeywordSpotter(spotter *KeywordSpotter)
Free the internal pointer inside the spotter to avoid memory leak.
func DeleteOfflinePunc ¶ added in v1.10.29
func DeleteOfflinePunc(punc *OfflinePunctuation)
func DeleteOfflineRecognizer ¶
func DeleteOfflineRecognizer(recognizer *OfflineRecognizer)
Frees the internal pointer of the recognizer to avoid memory leak.
func DeleteOfflineSpeakerDiarization ¶ added in v1.10.28
func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization)
func DeleteOfflineStream ¶
func DeleteOfflineStream(stream *OfflineStream)
Frees the internal pointer of the stream to avoid memory leak.
func DeleteOfflineTts ¶ added in v1.8.4
func DeleteOfflineTts(tts *OfflineTts)
Free the internal pointer inside the tts to avoid memory leak.
func DeleteOnlineRecognizer ¶
func DeleteOnlineRecognizer(recognizer *OnlineRecognizer)
Free the internal pointer inside the recognizer to avoid memory leak.
func DeleteOnlineStream ¶
func DeleteOnlineStream(stream *OnlineStream)
Delete the internal pointer inside the stream to avoid memory leak.
func DeleteSpeakerEmbeddingExtractor ¶ added in v1.9.15
func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor)
func DeleteSpeakerEmbeddingManager ¶ added in v1.9.15
func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager)
func DeleteSpokenLanguageIdentification ¶ added in v1.9.15
func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification)
func DeleteVoiceActivityDetector ¶ added in v1.9.15
func DeleteVoiceActivityDetector(vad *VoiceActivityDetector)
Types ¶
type CircularBuffer ¶ added in v1.9.15
type CircularBuffer struct {
// contains filtered or unexported fields
}
func NewCircularBuffer ¶ added in v1.9.15
func NewCircularBuffer(capacity int) *CircularBuffer
func (*CircularBuffer) Get ¶ added in v1.9.15
func (buffer *CircularBuffer) Get(start int, n int) []float32
func (*CircularBuffer) Head ¶ added in v1.9.15
func (buffer *CircularBuffer) Head() int
func (*CircularBuffer) Pop ¶ added in v1.9.15
func (buffer *CircularBuffer) Pop(n int)
func (*CircularBuffer) Push ¶ added in v1.9.15
func (buffer *CircularBuffer) Push(samples []float32)
func (*CircularBuffer) Reset ¶ added in v1.9.15
func (buffer *CircularBuffer) Reset()
func (*CircularBuffer) Size ¶ added in v1.9.15
func (buffer *CircularBuffer) Size() int
type FastClusteringConfig ¶ added in v1.10.28
type FeatureConfig ¶
type FeatureConfig struct { // Sample rate expected by the model. It is 16000 for all // pre-trained models provided by us SampleRate int // Feature dimension expected by the model. It is 80 for all // pre-trained models provided by us FeatureDim int }
Configuration for the feature extractor
type GeneratedAudio ¶ added in v1.8.4
type GeneratedAudio struct { // Normalized samples in the range [-1, 1] Samples []float32 SampleRate int }
func (*GeneratedAudio) Save ¶ added in v1.8.4
func (audio *GeneratedAudio) Save(filename string) bool
type KeywordSpotter ¶ added in v1.10.37
type KeywordSpotter struct {
// contains filtered or unexported fields
}
func NewKeywordSpotter ¶ added in v1.10.37
func NewKeywordSpotter(config *KeywordSpotterConfig) *KeywordSpotter
The user is responsible for invoking DeleteKeywordSpotter() to free the returned spotter and avoid a memory leak.
func (*KeywordSpotter) Decode ¶ added in v1.10.37
func (spotter *KeywordSpotter) Decode(s *OnlineStream)
Decode the stream. Before calling this function, you have to ensure that spotter.IsReady(s) returns true. Otherwise, you will be SAD.
You usually use it like below:
for spotter.IsReady(s) { spotter.Decode(s) }
func (*KeywordSpotter) GetResult ¶ added in v1.10.37
func (spotter *KeywordSpotter) GetResult(s *OnlineStream) *KeywordSpotterResult
Get the current result of the stream since the last invocation of Reset().
func (*KeywordSpotter) IsReady ¶ added in v1.10.37
func (spotter *KeywordSpotter) IsReady(s *OnlineStream) bool
Check whether the stream has enough feature frames for decoding. Return true if this stream is ready for decoding. Return false otherwise.
You will usually use it like below:
for spotter.IsReady(s) { spotter.Decode(s) }
type KeywordSpotterConfig ¶ added in v1.10.37
type KeywordSpotterConfig struct { FeatConfig FeatureConfig ModelConfig OnlineModelConfig MaxActivePaths int KeywordsFile string KeywordsScore float32 KeywordsThreshold float32 KeywordsBuf string KeywordsBufSize int }
Configuration for the keyword spotter.
type KeywordSpotterResult ¶ added in v1.10.37
type KeywordSpotterResult struct {
Keyword string
}
type OfflineLMConfig ¶
type OfflineLMConfig struct { Model string // Path to the model Scale float32 // scale for LM score }
Configuration for offline LM.
type OfflineModelConfig ¶
type OfflineModelConfig struct { Transducer OfflineTransducerModelConfig Paraformer OfflineParaformerModelConfig NemoCTC OfflineNemoEncDecCtcModelConfig Whisper OfflineWhisperModelConfig Tdnn OfflineTdnnModelConfig SenseVoice OfflineSenseVoiceModelConfig Moonshine OfflineMoonshineModelConfig Tokens string // Path to tokens.txt // Number of threads to use for neural network computation NumThreads int // 1 to print model meta information while loading Debug int // Optional. Valid values: cpu, cuda, coreml Provider string // Optional. Specify it for faster model initialization. ModelType string ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe BpeVocab string // Optional. TeleSpeechCtc string // Optional. }
type OfflineMoonshineModelConfig ¶ added in v1.10.30
type OfflineNemoEncDecCtcModelConfig ¶
type OfflineNemoEncDecCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
Configuration for offline/non-streaming NeMo CTC models.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html to download pre-trained models
type OfflineParaformerModelConfig ¶
type OfflineParaformerModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
Configuration for offline/non-streaming paraformer.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html to download pre-trained models
type OfflinePunctuation ¶ added in v1.10.29
type OfflinePunctuation struct {
// contains filtered or unexported fields
}
func NewOfflinePunctuation ¶ added in v1.10.29
func NewOfflinePunctuation(config *OfflinePunctuationConfig) *OfflinePunctuation
func (*OfflinePunctuation) AddPunct ¶ added in v1.10.29
func (punc *OfflinePunctuation) AddPunct(text string) string
type OfflinePunctuationConfig ¶ added in v1.10.29
type OfflinePunctuationConfig struct {
Model OfflinePunctuationModelConfig
}
type OfflinePunctuationModelConfig ¶ added in v1.10.29
type OfflinePunctuationModelConfig struct { CtTransformer string NumThreads C.int Debug C.int // true to print debug information of the model Provider string }
============================================================ For punctuation ============================================================
type OfflineRecognizer ¶
type OfflineRecognizer struct {
// contains filtered or unexported fields
}
It wraps a pointer from C
func NewOfflineRecognizer ¶
func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer
The user is responsible for invoking DeleteOfflineRecognizer() to free the returned recognizer and avoid a memory leak.
func (*OfflineRecognizer) Decode ¶
func (recognizer *OfflineRecognizer) Decode(s *OfflineStream)
Decode the offline stream.
func (*OfflineRecognizer) DecodeStreams ¶
func (recognizer *OfflineRecognizer) DecodeStreams(s []*OfflineStream)
Decode multiple streams in parallel, i.e., in batch.
type OfflineRecognizerConfig ¶
type OfflineRecognizerConfig struct { FeatConfig FeatureConfig ModelConfig OfflineModelConfig LmConfig OfflineLMConfig // Valid decoding method: greedy_search, modified_beam_search DecodingMethod string // Used only when DecodingMethod is modified_beam_search. MaxActivePaths int HotwordsFile string HotwordsScore float32 BlankPenalty float32 RuleFsts string RuleFars string }
Configuration for the offline/non-streaming recognizer.
type OfflineRecognizerResult ¶
type OfflineRecognizerResult struct { Text string Tokens []string Timestamps []float32 Lang string Emotion string Event string }
It contains recognition result of an offline stream.
type OfflineSenseVoiceModelConfig ¶ added in v1.10.17
type OfflineSpeakerDiarization ¶ added in v1.10.28
type OfflineSpeakerDiarization struct {
// contains filtered or unexported fields
}
func NewOfflineSpeakerDiarization ¶ added in v1.10.28
func NewOfflineSpeakerDiarization(config *OfflineSpeakerDiarizationConfig) *OfflineSpeakerDiarization
func (*OfflineSpeakerDiarization) Process ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) Process(samples []float32) []OfflineSpeakerDiarizationSegment
func (*OfflineSpeakerDiarization) SampleRate ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) SampleRate() int
func (*OfflineSpeakerDiarization) SetConfig ¶ added in v1.10.28
func (sd *OfflineSpeakerDiarization) SetConfig(config *OfflineSpeakerDiarizationConfig)
Only config.Clustering is used. All other fields are ignored.
type OfflineSpeakerDiarizationConfig ¶ added in v1.10.28
type OfflineSpeakerDiarizationConfig struct { Segmentation OfflineSpeakerSegmentationModelConfig Embedding SpeakerEmbeddingExtractorConfig Clustering FastClusteringConfig MinDurationOn float32 MinDurationOff float32 }
type OfflineSpeakerDiarizationSegment ¶ added in v1.10.28
type OfflineSpeakerSegmentationModelConfig ¶ added in v1.10.28
type OfflineSpeakerSegmentationModelConfig struct { Pyannote OfflineSpeakerSegmentationPyannoteModelConfig NumThreads int Debug int Provider string }
type OfflineSpeakerSegmentationPyannoteModelConfig ¶ added in v1.10.28
type OfflineSpeakerSegmentationPyannoteModelConfig struct {
Model string
}
============================================================ For offline speaker diarization ============================================================
type OfflineStream ¶
type OfflineStream struct {
// contains filtered or unexported fields
}
It wraps a pointer from C
func NewOfflineStream ¶
func NewOfflineStream(recognizer *OfflineRecognizer) *OfflineStream
The user is responsible for invoking DeleteOfflineStream() to free the returned stream and avoid a memory leak.
func (*OfflineStream) AcceptWaveform ¶
func (s *OfflineStream) AcceptWaveform(sampleRate int, samples []float32)
Input audio samples for the offline stream. Please only call it once. That is, input all samples at once.
sampleRate is the sample rate of the input audio samples. If it is different from the value expected by the feature extractor, we will do resampling inside.
samples contains the actual audio samples. Each sample is in the range [-1, 1].
func (*OfflineStream) GetResult ¶
func (s *OfflineStream) GetResult() *OfflineRecognizerResult
Get the recognition result of the offline stream.
type OfflineTdnnModelConfig ¶ added in v1.7.8
type OfflineTdnnModelConfig struct {
Model string
}
type OfflineTransducerModelConfig ¶
type OfflineTransducerModelConfig struct { Encoder string // Path to the encoder model, i.e., encoder.onnx or encoder.int8.onnx Decoder string // Path to the decoder model Joiner string // Path to the joiner model }
Configuration for offline/non-streaming transducer.
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html to download pre-trained models
type OfflineTts ¶ added in v1.8.4
type OfflineTts struct {
// contains filtered or unexported fields
}
The offline tts class. It wraps a pointer from C.
func NewOfflineTts ¶ added in v1.8.4
func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts
The user is responsible for invoking DeleteOfflineTts() to free the returned tts and avoid a memory leak.
func (*OfflineTts) Generate ¶ added in v1.8.4
func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio
type OfflineTtsConfig ¶ added in v1.8.4
type OfflineTtsConfig struct { Model OfflineTtsModelConfig RuleFsts string RuleFars string MaxNumSentences int }
type OfflineTtsMatchaModelConfig ¶ added in v1.10.38
type OfflineTtsMatchaModelConfig struct { AcousticModel string // Path to the acoustic model for MatchaTTS Vocoder string // Path to the vocoder model for MatchaTTS Lexicon string // Path to lexicon.txt Tokens string // Path to tokens.txt DataDir string // Path to espeak-ng-data directory NoiseScale float32 // noise scale for vits models. Please use 0.667 in general LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed DictDir string // Path to dict directory for jieba (used only in Chinese tts) }
type OfflineTtsModelConfig ¶ added in v1.8.4
type OfflineTtsModelConfig struct { Vits OfflineTtsVitsModelConfig Matcha OfflineTtsMatchaModelConfig // Number of threads to use for neural network computation NumThreads int // 1 to print model meta information while loading Debug int // Optional. Valid values: cpu, cuda, coreml Provider string }
type OfflineTtsVitsModelConfig ¶ added in v1.8.4
type OfflineTtsVitsModelConfig struct { Model string // Path to the VITS onnx model Lexicon string // Path to lexicon.txt Tokens string // Path to tokens.txt DataDir string // Path to espeak-ng-data directory NoiseScale float32 // noise scale for vits models. Please use 0.667 in general NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed DictDir string // Path to dict directory for jieba (used only in Chinese tts) }
Configuration for offline/non-streaming text-to-speech (TTS).
Please refer to https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html to download pre-trained models
type OfflineWhisperModelConfig ¶ added in v1.7.8
type OnlineCtcFstDecoderConfig ¶ added in v1.9.16
type OnlineModelConfig ¶ added in v1.7.6
type OnlineModelConfig struct { Transducer OnlineTransducerModelConfig Paraformer OnlineParaformerModelConfig Zipformer2Ctc OnlineZipformer2CtcModelConfig Tokens string // Path to tokens.txt NumThreads int // Number of threads to use for neural network computation Provider string // Optional. Valid values are: cpu, cuda, coreml Debug int // 1 to show model meta information while loading it. ModelType string // Optional. You can specify it for faster model initialization ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe BpeVocab string // Optional. TokensBuf string // Optional. TokensBufSize int // Optional. }
Configuration for online/streaming models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html and https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models
type OnlineParaformerModelConfig ¶ added in v1.7.6
type OnlineParaformerModelConfig struct { Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx Decoder string // Path to the decoder model. }
Configuration for online/streaming paraformer models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models
type OnlineRecognizer ¶
type OnlineRecognizer struct {
// contains filtered or unexported fields
}
The online recognizer class. It wraps a pointer from C.
func NewOnlineRecognizer ¶
func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer
The user is responsible for invoking DeleteOnlineRecognizer() to free the returned recognizer and avoid a memory leak.
func (*OnlineRecognizer) Decode ¶
func (recognizer *OnlineRecognizer) Decode(s *OnlineStream)
Decode the stream. Before calling this function, you have to ensure that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
You usually use it like below:
for recognizer.IsReady(s) { recognizer.Decode(s) }
func (*OnlineRecognizer) DecodeStreams ¶
func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream)
Decode multiple streams in parallel, i.e., in batch. You have to ensure that each stream is ready for decoding. Otherwise, you will be SAD.
func (*OnlineRecognizer) GetResult ¶
func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult
Get the current result of the stream since the last invocation of Reset().
func (*OnlineRecognizer) IsEndpoint ¶
func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool
Return true if an endpoint is detected.
You usually use it like below:
if recognizer.IsEndpoint(s) { // do your own stuff after detecting an endpoint recognizer.Reset(s) }
func (*OnlineRecognizer) IsReady ¶
func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool
Check whether the stream has enough feature frames for decoding. Return true if this stream is ready for decoding. Return false otherwise.
You will usually use it like below:
for recognizer.IsReady(s) { recognizer.Decode(s) }
func (*OnlineRecognizer) Reset ¶
func (recognizer *OnlineRecognizer) Reset(s *OnlineStream)
After calling this function, the internal neural network model states are reset and IsEndpoint(s) would return false. GetResult(s) would also return an empty string.
type OnlineRecognizerConfig ¶
type OnlineRecognizerConfig struct { FeatConfig FeatureConfig ModelConfig OnlineModelConfig // Valid decoding methods: greedy_search, modified_beam_search DecodingMethod string // Used only when DecodingMethod is modified_beam_search. It specifies // the maximum number of paths to keep during the search MaxActivePaths int EnableEndpoint int // 1 to enable endpoint detection. // Please see // https://k2-fsa.github.io/sherpa/ncnn/endpoint.html // for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence // and Rule3MinUtteranceLength. Rule1MinTrailingSilence float32 Rule2MinTrailingSilence float32 Rule3MinUtteranceLength float32 HotwordsFile string HotwordsScore float32 BlankPenalty float32 CtcFstDecoderConfig OnlineCtcFstDecoderConfig RuleFsts string RuleFars string HotwordsBuf string HotwordsBufSize int }
Configuration for the online/streaming recognizer.
type OnlineRecognizerResult ¶
type OnlineRecognizerResult struct {
Text string
}
It contains the recognition result for an online stream.
type OnlineStream ¶
type OnlineStream struct {
// contains filtered or unexported fields
}
The online stream class. It wraps a pointer from C.
func NewKeywordStream ¶ added in v1.10.37
func NewKeywordStream(spotter *KeywordSpotter) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func NewKeywordStreamWithKeywords ¶ added in v1.10.37
func NewKeywordStreamWithKeywords(spotter *KeywordSpotter, keywords string) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func NewOnlineStream ¶
func NewOnlineStream(recognizer *OnlineRecognizer) *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func (*OnlineStream) AcceptWaveform ¶
func (s *OnlineStream) AcceptWaveform(sampleRate int, samples []float32)
Input audio samples for the stream.
sampleRate is the actual sample rate of the input audio samples. If it is different from the sample rate expected by the feature extractor, we will do resampling inside.
samples contains audio samples. Each sample is in the range [-1, 1]
func (*OnlineStream) InputFinished ¶
func (s *OnlineStream) InputFinished()
Signal that there will be no incoming audio samples. After calling this function, you cannot call OnlineStream.AcceptWaveform any longer.
The main purpose of this function is to flush the remaining audio samples buffered inside for feature extraction.
type OnlineTransducerModelConfig ¶
type OnlineTransducerModelConfig struct { Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx Decoder string // Path to the decoder model. Joiner string // Path to the joiner model. }
Configuration for online/streaming transducer models
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html to download pre-trained models
type OnlineZipformer2CtcModelConfig ¶ added in v1.9.7
type OnlineZipformer2CtcModelConfig struct {
Model string // Path to the onnx model
}
Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html to download pre-trained models
type SileroVadModelConfig ¶ added in v1.9.15
type SileroVadModelConfig struct { Model string Threshold float32 MinSilenceDuration float32 MinSpeechDuration float32 WindowSize int MaxSpeechDuration float32 }
============================================================ For VAD ============================================================
type SpeakerEmbeddingExtractor ¶ added in v1.9.15
type SpeakerEmbeddingExtractor struct {
// contains filtered or unexported fields
}
func NewSpeakerEmbeddingExtractor ¶ added in v1.9.15
func NewSpeakerEmbeddingExtractor(config *SpeakerEmbeddingExtractorConfig) *SpeakerEmbeddingExtractor
The user has to invoke DeleteSpeakerEmbeddingExtractor() to free the returned value to avoid memory leak
func (*SpeakerEmbeddingExtractor) Compute ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) Compute(stream *OnlineStream) []float32
func (*SpeakerEmbeddingExtractor) CreateStream ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) CreateStream() *OnlineStream
The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.
func (*SpeakerEmbeddingExtractor) Dim ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) Dim() int
func (*SpeakerEmbeddingExtractor) IsReady ¶ added in v1.9.15
func (ex *SpeakerEmbeddingExtractor) IsReady(stream *OnlineStream) bool
type SpeakerEmbeddingExtractorConfig ¶ added in v1.9.15
type SpeakerEmbeddingManager ¶ added in v1.9.15
type SpeakerEmbeddingManager struct {
// contains filtered or unexported fields
}
func NewSpeakerEmbeddingManager ¶ added in v1.9.15
func NewSpeakerEmbeddingManager(dim int) *SpeakerEmbeddingManager
The user has to invoke DeleteSpeakerEmbeddingManager() to free the returned value to avoid memory leak
func (*SpeakerEmbeddingManager) AllSpeakers ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) AllSpeakers() []string
func (*SpeakerEmbeddingManager) Contains ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Contains(name string) bool
func (*SpeakerEmbeddingManager) NumSpeakers ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) NumSpeakers() int
func (*SpeakerEmbeddingManager) Register ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool
func (*SpeakerEmbeddingManager) RegisterV ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool
func (*SpeakerEmbeddingManager) Remove ¶ added in v1.9.15
func (m *SpeakerEmbeddingManager) Remove(name string) bool
type SpeechSegment ¶ added in v1.9.15
type SpokenLanguageIdentification ¶ added in v1.9.15
type SpokenLanguageIdentification struct {
// contains filtered or unexported fields
}
func NewSpokenLanguageIdentification ¶ added in v1.9.15
func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification
func (*SpokenLanguageIdentification) Compute ¶ added in v1.9.15
func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult
func (*SpokenLanguageIdentification) CreateStream ¶ added in v1.9.15
func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream
The user has to invoke DeleteOfflineStream() to free the returned value to avoid memory leak
type SpokenLanguageIdentificationConfig ¶ added in v1.9.15
type SpokenLanguageIdentificationConfig struct { Whisper SpokenLanguageIdentificationWhisperConfig NumThreads int Debug int Provider string }
type SpokenLanguageIdentificationResult ¶ added in v1.9.15
type SpokenLanguageIdentificationResult struct {
Lang string
}
type SpokenLanguageIdentificationWhisperConfig ¶ added in v1.9.15
type VadModelConfig ¶ added in v1.9.15
type VadModelConfig struct { SileroVad SileroVadModelConfig SampleRate int NumThreads int Provider string Debug int }
type VoiceActivityDetector ¶ added in v1.9.15
type VoiceActivityDetector struct {
// contains filtered or unexported fields
}
func NewVoiceActivityDetector ¶ added in v1.9.15
func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector
func (*VoiceActivityDetector) AcceptWaveform ¶ added in v1.9.15
func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32)
func (*VoiceActivityDetector) Clear ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Clear()
func (*VoiceActivityDetector) Flush ¶ added in v1.10.13
func (vad *VoiceActivityDetector) Flush()
func (*VoiceActivityDetector) Front ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Front() *SpeechSegment
func (*VoiceActivityDetector) IsEmpty ¶ added in v1.9.15
func (vad *VoiceActivityDetector) IsEmpty() bool
func (*VoiceActivityDetector) IsSpeech ¶ added in v1.9.15
func (vad *VoiceActivityDetector) IsSpeech() bool
func (*VoiceActivityDetector) Pop ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Pop()
func (*VoiceActivityDetector) Reset ¶ added in v1.9.15
func (vad *VoiceActivityDetector) Reset()