sherpa_onnx

package module
v0.0.0-...-4d8d542
Published: May 13, 2025 License: Apache-2.0 Imports: 2 Imported by: 0

README

Introduction

This repo contains the Go package of sherpa-onnx for macOS, supporting x86_64 (Intel chips) as well as aarch64 (Apple Silicon, e.g., M1).

Documentation

Overview

Speech recognition with Next-gen Kaldi.

sherpa-onnx is an open-source speech recognition framework for Next-gen Kaldi. It depends only on onnxruntime, supporting both streaming and non-streaming speech recognition.

It does not need to access the network during recognition and everything runs locally.

It supports a variety of platforms, such as Linux (x86_64, aarch64, arm), Windows (x86_64, x86), macOS (x86_64, arm64), etc.

Usage examples:

  1. Real-time speech recognition from a microphone

    Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/real-time-speech-recognition-from-microphone

  2. Decode files using a non-streaming model

    Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-decode-files

  3. Decode files using a streaming model

    Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files

  4. Convert text to speech using a non-streaming model

    Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DeleteAudioTagging

func DeleteAudioTagging(tagging *AudioTagging)

func DeleteCircularBuffer

func DeleteCircularBuffer(buffer *CircularBuffer)

func DeleteKeywordSpotter

func DeleteKeywordSpotter(spotter *KeywordSpotter)

Free the internal pointer inside the spotter to avoid a memory leak.

func DeleteOfflinePunc

func DeleteOfflinePunc(punc *OfflinePunctuation)

func DeleteOfflineRecognizer

func DeleteOfflineRecognizer(recognizer *OfflineRecognizer)

Frees the internal pointer inside the recognizer to avoid a memory leak.

func DeleteOfflineSpeakerDiarization

func DeleteOfflineSpeakerDiarization(sd *OfflineSpeakerDiarization)

func DeleteOfflineSpeechDenoiser

func DeleteOfflineSpeechDenoiser(sd *OfflineSpeechDenoiser)

Free the internal pointer inside the OfflineSpeechDenoiser to avoid a memory leak.

func DeleteOfflineStream

func DeleteOfflineStream(stream *OfflineStream)

Frees the internal pointer of the stream to avoid a memory leak.

func DeleteOfflineTts

func DeleteOfflineTts(tts *OfflineTts)

Free the internal pointer inside the tts to avoid a memory leak.

func DeleteOnlineRecognizer

func DeleteOnlineRecognizer(recognizer *OnlineRecognizer)

Free the internal pointer inside the recognizer to avoid a memory leak.

func DeleteOnlineStream

func DeleteOnlineStream(stream *OnlineStream)

Delete the internal pointer inside the stream to avoid a memory leak.

func DeleteSpeakerEmbeddingExtractor

func DeleteSpeakerEmbeddingExtractor(ex *SpeakerEmbeddingExtractor)

func DeleteSpeakerEmbeddingManager

func DeleteSpeakerEmbeddingManager(m *SpeakerEmbeddingManager)

func DeleteSpokenLanguageIdentification

func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification)

func DeleteVoiceActivityDetector

func DeleteVoiceActivityDetector(vad *VoiceActivityDetector)

Types

type AudioEvent

type AudioEvent struct {
	Name  string
	Index int
	Prob  float32
}

type AudioTagging

type AudioTagging struct {
	// contains filtered or unexported fields
}

func NewAudioTagging

func NewAudioTagging(config *AudioTaggingConfig) *AudioTagging

The user is responsible for invoking DeleteAudioTagging() to free the returned tagger and avoid a memory leak.

func (*AudioTagging) Compute

func (tagging *AudioTagging) Compute(s *OfflineStream, topK int32) []AudioEvent
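
The following is an illustrative sketch (not from the upstream documentation) of how audio tagging might be wired together. The model, label, and wave file paths are placeholders, and the fragment assumes the package has been imported as sherpa_onnx.

config := sherpa_onnx.AudioTaggingConfig{}
config.Model.Zipformer.Model = "./model.onnx" // placeholder model path
config.Model.NumThreads = 1
config.Labels = "./class_labels_indices.csv" // placeholder labels file
config.TopK = 5

tagging := sherpa_onnx.NewAudioTagging(&config)
defer sherpa_onnx.DeleteAudioTagging(tagging)

stream := sherpa_onnx.NewAudioTaggingStream(tagging)
defer sherpa_onnx.DeleteOfflineStream(stream)

wave := sherpa_onnx.ReadWave("./test.wav") // placeholder input file
stream.AcceptWaveform(wave.SampleRate, wave.Samples)

for _, event := range tagging.Compute(stream, 5) {
  fmt.Printf("%s: %.3f\n", event.Name, event.Prob)
}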

type AudioTaggingConfig

type AudioTaggingConfig struct {
	Model  AudioTaggingModelConfig
	Labels string
	TopK   int32
}

type AudioTaggingModelConfig

type AudioTaggingModelConfig struct {
	Zipformer  OfflineZipformerAudioTaggingModelConfig
	Ced        string
	NumThreads int32
	Debug      int32
	Provider   string
}

type CircularBuffer

type CircularBuffer struct {
	// contains filtered or unexported fields
}

func NewCircularBuffer

func NewCircularBuffer(capacity int) *CircularBuffer

func (*CircularBuffer) Get

func (buffer *CircularBuffer) Get(start int, n int) []float32

func (*CircularBuffer) Head

func (buffer *CircularBuffer) Head() int

func (*CircularBuffer) Pop

func (buffer *CircularBuffer) Pop(n int)

func (*CircularBuffer) Push

func (buffer *CircularBuffer) Push(samples []float32)

func (*CircularBuffer) Reset

func (buffer *CircularBuffer) Reset()

func (*CircularBuffer) Size

func (buffer *CircularBuffer) Size() int
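
A small sketch of the intended push/read/pop flow. It assumes Get(Head(), Size()) reads everything currently buffered; that semantics is inferred from the method names, not stated in this documentation.

buffer := sherpa_onnx.NewCircularBuffer(16000)
defer sherpa_onnx.DeleteCircularBuffer(buffer)

buffer.Push([]float32{0.1, 0.2, 0.3})
chunk := buffer.Get(buffer.Head(), buffer.Size()) // read all buffered samples (assumed semantics)
buffer.Pop(len(chunk))                            // discard what has been consumed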

type DenoisedAudio

type DenoisedAudio struct {
	// Normalized samples in the range [-1, 1]
	Samples []float32

	SampleRate int
}

func (*DenoisedAudio) Save

func (audio *DenoisedAudio) Save(filename string) bool

type FastClusteringConfig

type FastClusteringConfig struct {
	NumClusters int
	Threshold   float32
}

type FeatureConfig

type FeatureConfig struct {
	// Sample rate expected by the model. It is 16000 for all
	// pre-trained models provided by us
	SampleRate int
	// Feature dimension expected by the model. It is 80 for all
	// pre-trained models provided by us
	FeatureDim int
}

Configuration for the feature extractor

type GeneratedAudio

type GeneratedAudio struct {
	// Normalized samples in the range [-1, 1]
	Samples []float32

	SampleRate int
}

func (*GeneratedAudio) Save

func (audio *GeneratedAudio) Save(filename string) bool

type HomophoneReplacerConfig

type HomophoneReplacerConfig struct {
	DictDir  string
	Lexicon  string
	RuleFsts string
}

type KeywordSpotter

type KeywordSpotter struct {
	// contains filtered or unexported fields
}

func NewKeywordSpotter

func NewKeywordSpotter(config *KeywordSpotterConfig) *KeywordSpotter

The user is responsible for invoking DeleteKeywordSpotter() to free the returned spotter and avoid a memory leak.

func (*KeywordSpotter) Decode

func (spotter *KeywordSpotter) Decode(s *OnlineStream)

Decode the stream. Before calling this function, you have to ensure that spotter.IsReady(s) returns true. Otherwise, you will be SAD.

You usually use it like below:

for spotter.IsReady(s) {
  spotter.Decode(s)
}

func (*KeywordSpotter) GetResult

func (spotter *KeywordSpotter) GetResult(s *OnlineStream) *KeywordSpotterResult

Get the current result of the stream since the last invocation of Reset().

func (*KeywordSpotter) IsReady

func (spotter *KeywordSpotter) IsReady(s *OnlineStream) bool

Check whether the stream has enough feature frames for decoding. Return true if this stream is ready for decoding. Return false otherwise.

You will usually use it like below:

for spotter.IsReady(s) {
   spotter.Decode(s)
}

func (*KeywordSpotter) Reset

func (spotter *KeywordSpotter) Reset(s *OnlineStream)

You MUST call it right after detecting a keyword
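
Putting IsReady(), Decode(), GetResult() and Reset() together, a typical polling loop might look roughly like the sketch below; s is an OnlineStream created with NewKeywordStream() and fed elsewhere via AcceptWaveform():

for spotter.IsReady(s) {
  spotter.Decode(s)

  result := spotter.GetResult(s)
  if result.Keyword != "" {
    fmt.Printf("Detected keyword: %s\n", result.Keyword)
    spotter.Reset(s) // required right after a detection
  }
}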

type KeywordSpotterConfig

type KeywordSpotterConfig struct {
	FeatConfig        FeatureConfig
	ModelConfig       OnlineModelConfig
	MaxActivePaths    int
	KeywordsFile      string
	KeywordsScore     float32
	KeywordsThreshold float32
	KeywordsBuf       string
	KeywordsBufSize   int
}

Configuration for the online/streaming recognizer.

type KeywordSpotterResult

type KeywordSpotterResult struct {
	Keyword string
}

type OfflineDolphinModelConfig

type OfflineDolphinModelConfig struct {
	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}

type OfflineFireRedAsrModelConfig

type OfflineFireRedAsrModelConfig struct {
	Encoder string
	Decoder string
}

type OfflineLMConfig

type OfflineLMConfig struct {
	Model string  // Path to the model
	Scale float32 // scale for LM score
}

Configuration for offline LM.

type OfflineModelConfig

type OfflineModelConfig struct {
	Transducer OfflineTransducerModelConfig
	Paraformer OfflineParaformerModelConfig
	NemoCTC    OfflineNemoEncDecCtcModelConfig
	Whisper    OfflineWhisperModelConfig
	Tdnn       OfflineTdnnModelConfig
	SenseVoice OfflineSenseVoiceModelConfig
	Moonshine  OfflineMoonshineModelConfig
	FireRedAsr OfflineFireRedAsrModelConfig
	Dolphin    OfflineDolphinModelConfig
	Tokens     string // Path to tokens.txt

	// Number of threads to use for neural network computation
	NumThreads int

	// 1 to print model meta information while loading
	Debug int

	// Optional. Valid values: cpu, cuda, coreml
	Provider string

	// Optional. Specify it for faster model initialization.
	ModelType string

	ModelingUnit  string // Optional. cjkchar, bpe, cjkchar+bpe
	BpeVocab      string // Optional.
	TeleSpeechCtc string // Optional.
}

type OfflineMoonshineModelConfig

type OfflineMoonshineModelConfig struct {
	Preprocessor    string
	Encoder         string
	UncachedDecoder string
	CachedDecoder   string
}

type OfflineNemoEncDecCtcModelConfig

type OfflineNemoEncDecCtcModelConfig struct {
	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}

Configuration for offline/non-streaming NeMo CTC models.

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html to download pre-trained models

type OfflineParaformerModelConfig

type OfflineParaformerModelConfig struct {
	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}

Configuration for offline/non-streaming paraformer.

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html to download pre-trained models

type OfflinePunctuation

type OfflinePunctuation struct {
	// contains filtered or unexported fields
}

func NewOfflinePunctuation

func NewOfflinePunctuation(config *OfflinePunctuationConfig) *OfflinePunctuation

func (*OfflinePunctuation) AddPunct

func (punc *OfflinePunctuation) AddPunct(text string) string
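
A brief sketch of adding punctuation to plain text; the model path is a placeholder and the fragment assumes the package has been imported as sherpa_onnx.

config := sherpa_onnx.OfflinePunctuationConfig{}
config.Model.CtTransformer = "./model.onnx" // placeholder model path
config.Model.NumThreads = 1

punc := sherpa_onnx.NewOfflinePunctuation(&config)
defer sherpa_onnx.DeleteOfflinePunc(punc)

fmt.Println(punc.AddPunct("how are you today i am fine thank you"))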

type OfflinePunctuationConfig

type OfflinePunctuationConfig struct {
	Model OfflinePunctuationModelConfig
}

type OfflinePunctuationModelConfig

type OfflinePunctuationModelConfig struct {
	CtTransformer string
	NumThreads    C.int
	Debug         C.int // 1 to print debug information of the model
	Provider      string
}

Configuration for punctuation.

type OfflineRecognizer

type OfflineRecognizer struct {
	// contains filtered or unexported fields
}

It wraps a pointer from C

func NewOfflineRecognizer

func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer

The user is responsible for invoking DeleteOfflineRecognizer() to free the returned recognizer and avoid a memory leak.

func (*OfflineRecognizer) Decode

func (recognizer *OfflineRecognizer) Decode(s *OfflineStream)

Decode the offline stream.
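
For illustration, a minimal end-to-end sketch of decoding a single wave file with a non-streaming Whisper model. The import path, the model file names, and the input file are assumptions, not part of this documentation.

package main

import (
  "fmt"

  sherpa_onnx "github.com/k2-fsa/sherpa-onnx-go-macos" // assumed import path for this package
)

func main() {
  config := sherpa_onnx.OfflineRecognizerConfig{}
  config.FeatConfig = sherpa_onnx.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
  config.ModelConfig.Whisper.Encoder = "./encoder.onnx" // placeholder model files
  config.ModelConfig.Whisper.Decoder = "./decoder.onnx"
  config.ModelConfig.Tokens = "./tokens.txt"
  config.ModelConfig.NumThreads = 1
  config.DecodingMethod = "greedy_search"

  recognizer := sherpa_onnx.NewOfflineRecognizer(&config)
  defer sherpa_onnx.DeleteOfflineRecognizer(recognizer)

  stream := sherpa_onnx.NewOfflineStream(recognizer)
  defer sherpa_onnx.DeleteOfflineStream(stream)

  // ReadWave returns normalized samples and the sample rate of the file.
  wave := sherpa_onnx.ReadWave("./test.wav") // placeholder input file
  stream.AcceptWaveform(wave.SampleRate, wave.Samples)

  recognizer.Decode(stream)
  fmt.Println(stream.GetResult().Text)
}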

func (*OfflineRecognizer) DecodeStreams

func (recognizer *OfflineRecognizer) DecodeStreams(s []*OfflineStream)

Decode multiple streams in parallel, i.e., in batch.

func (*OfflineRecognizer) SetConfig

func (r *OfflineRecognizer) SetConfig(config *OfflineRecognizerConfig)

Set a new config to replace the current configuration.

type OfflineRecognizerConfig

type OfflineRecognizerConfig struct {
	FeatConfig  FeatureConfig
	ModelConfig OfflineModelConfig
	LmConfig    OfflineLMConfig

	// Valid decoding method: greedy_search, modified_beam_search
	DecodingMethod string

	// Used only when DecodingMethod is modified_beam_search.
	MaxActivePaths int
	HotwordsFile   string
	HotwordsScore  float32
	BlankPenalty   float32
	RuleFsts       string
	RuleFars       string
	Hr             HomophoneReplacerConfig
}

Configuration for the offline/non-streaming recognizer.

type OfflineRecognizerResult

type OfflineRecognizerResult struct {
	Text       string
	Tokens     []string
	Timestamps []float32
	Lang       string
	Emotion    string
	Event      string
}

It contains recognition result of an offline stream.

type OfflineSenseVoiceModelConfig

type OfflineSenseVoiceModelConfig struct {
	Model                       string
	Language                    string
	UseInverseTextNormalization int
}

type OfflineSpeakerDiarization

type OfflineSpeakerDiarization struct {
	// contains filtered or unexported fields
}

func (*OfflineSpeakerDiarization) Process

func (*OfflineSpeakerDiarization) SampleRate

func (sd *OfflineSpeakerDiarization) SampleRate() int

func (*OfflineSpeakerDiarization) SetConfig

Only config.Clustering is used. All other fields are ignored.

type OfflineSpeakerDiarizationConfig

type OfflineSpeakerDiarizationConfig struct {
	Segmentation   OfflineSpeakerSegmentationModelConfig
	Embedding      SpeakerEmbeddingExtractorConfig
	Clustering     FastClusteringConfig
	MinDurationOn  float32
	MinDurationOff float32
}

type OfflineSpeakerDiarizationSegment

type OfflineSpeakerDiarizationSegment struct {
	Start   float32
	End     float32
	Speaker int
}

type OfflineSpeakerSegmentationModelConfig

type OfflineSpeakerSegmentationModelConfig struct {
	Pyannote   OfflineSpeakerSegmentationPyannoteModelConfig
	NumThreads int
	Debug      int
	Provider   string
}

type OfflineSpeakerSegmentationPyannoteModelConfig

type OfflineSpeakerSegmentationPyannoteModelConfig struct {
	Model string
}

Configuration for offline speaker diarization.

type OfflineSpeechDenoiser

type OfflineSpeechDenoiser struct {
	// contains filtered or unexported fields
}

func NewOfflineSpeechDenoiser

func NewOfflineSpeechDenoiser(config *OfflineSpeechDenoiserConfig) *OfflineSpeechDenoiser

The user is responsible for invoking DeleteOfflineSpeechDenoiser() to free the returned denoiser and avoid a memory leak.

func (*OfflineSpeechDenoiser) Run

func (sd *OfflineSpeechDenoiser) Run(samples []float32, sampleRate int) *DenoisedAudio

func (*OfflineSpeechDenoiser) SampleRate

func (sd *OfflineSpeechDenoiser) SampleRate() int
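
A short sketch of denoising a wave file and saving the result; the paths are placeholders and the fragment assumes the package has been imported as sherpa_onnx.

config := sherpa_onnx.OfflineSpeechDenoiserConfig{}
config.Model.Gtcrn.Model = "./gtcrn.onnx" // placeholder model path
config.Model.NumThreads = 1

sd := sherpa_onnx.NewOfflineSpeechDenoiser(&config)
defer sherpa_onnx.DeleteOfflineSpeechDenoiser(sd)

wave := sherpa_onnx.ReadWave("./noisy.wav") // placeholder input file
denoised := sd.Run(wave.Samples, wave.SampleRate)
denoised.Save("./denoised.wav")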

type OfflineSpeechDenoiserConfig

type OfflineSpeechDenoiserConfig struct {
	Model OfflineSpeechDenoiserModelConfig
}

type OfflineSpeechDenoiserGtcrnModelConfig

type OfflineSpeechDenoiserGtcrnModelConfig struct {
	Model string
}

type OfflineSpeechDenoiserModelConfig

type OfflineSpeechDenoiserModelConfig struct {
	Gtcrn      OfflineSpeechDenoiserGtcrnModelConfig
	NumThreads int32
	Debug      int32
	Provider   string
}

type OfflineStream

type OfflineStream struct {
	// contains filtered or unexported fields
}

It wraps a pointer from C

func NewAudioTaggingStream

func NewAudioTaggingStream(tagging *AudioTagging) *OfflineStream

The user is responsible for invoking DeleteOfflineStream() to free the returned stream and avoid a memory leak.

func NewOfflineStream

func NewOfflineStream(recognizer *OfflineRecognizer) *OfflineStream

The user is responsible for invoking DeleteOfflineStream() to free the returned stream and avoid a memory leak.

func (*OfflineStream) AcceptWaveform

func (s *OfflineStream) AcceptWaveform(sampleRate int, samples []float32)

Input audio samples for the offline stream. Please only call it once. That is, input all samples at once.

sampleRate is the sample rate of the input audio samples. If it is different from the value expected by the feature extractor, we will do resampling inside.

samples contains the actual audio samples. Each sample is in the range [-1, 1].

func (*OfflineStream) GetResult

func (s *OfflineStream) GetResult() *OfflineRecognizerResult

Get the recognition result of the offline stream.

type OfflineTdnnModelConfig

type OfflineTdnnModelConfig struct {
	Model string
}

type OfflineTransducerModelConfig

type OfflineTransducerModelConfig struct {
	Encoder string // Path to the encoder model, i.e., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model
	Joiner  string // Path to the joiner model
}

Configuration for offline/non-streaming transducer.

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html to download pre-trained models

type OfflineTts

type OfflineTts struct {
	// contains filtered or unexported fields
}

The offline tts class. It wraps a pointer from C.

func NewOfflineTts

func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts

The user is responsible for invoking DeleteOfflineTts() to free the returned tts and avoid a memory leak.

func (*OfflineTts) Generate

func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio
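
A minimal sketch of generating speech with a VITS model and saving it to a wave file; the model paths are placeholders and the fragment assumes the package has been imported as sherpa_onnx.

config := sherpa_onnx.OfflineTtsConfig{}
config.Model.Vits.Model = "./model.onnx" // placeholder model files
config.Model.Vits.Tokens = "./tokens.txt"
config.Model.Vits.DataDir = "./espeak-ng-data"
config.Model.NumThreads = 1

tts := sherpa_onnx.NewOfflineTts(&config)
defer sherpa_onnx.DeleteOfflineTts(tts)

audio := tts.Generate("Hello from sherpa-onnx.", 0, 1.0) // speaker 0, normal speed
audio.Save("./generated.wav")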

type OfflineTtsConfig

type OfflineTtsConfig struct {
	Model           OfflineTtsModelConfig
	RuleFsts        string
	RuleFars        string
	MaxNumSentences int
	SilenceScale    float32
}

type OfflineTtsKokoroModelConfig

type OfflineTtsKokoroModelConfig struct {
	Model       string  // Path to the model for kokoro
	Voices      string  // Path to the voices.bin for kokoro
	Tokens      string  // Path to tokens.txt
	DataDir     string  // Path to espeak-ng-data directory
	DictDir     string  // Path to dict directory
	Lexicon     string  // Path to lexicon files
	LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
}

type OfflineTtsMatchaModelConfig

type OfflineTtsMatchaModelConfig struct {
	AcousticModel string  // Path to the acoustic model for MatchaTTS
	Vocoder       string  // Path to the vocoder model for MatchaTTS
	Lexicon       string  // Path to lexicon.txt
	Tokens        string  // Path to tokens.txt
	DataDir       string  // Path to espeak-ng-data directory
	NoiseScale    float32 // noise scale for vits models. Please use 0.667 in general
	LengthScale   float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
	DictDir       string  // Path to dict directory for jieba (used only in Chinese tts)
}

type OfflineTtsModelConfig

type OfflineTtsModelConfig struct {
	Vits   OfflineTtsVitsModelConfig
	Matcha OfflineTtsMatchaModelConfig
	Kokoro OfflineTtsKokoroModelConfig

	// Number of threads to use for neural network computation
	NumThreads int

	// 1 to print model meta information while loading
	Debug int

	// Optional. Valid values: cpu, cuda, coreml
	Provider string
}

type OfflineTtsVitsModelConfig

type OfflineTtsVitsModelConfig struct {
	Model       string  // Path to the VITS onnx model
	Lexicon     string  // Path to lexicon.txt
	Tokens      string  // Path to tokens.txt
	DataDir     string  // Path to espeak-ng-data directory
	NoiseScale  float32 // noise scale for vits models. Please use 0.667 in general
	NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general
	LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
	DictDir     string  // Path to dict directory for jieba (used only in Chinese tts)
}

Configuration for offline/non-streaming text-to-speech (TTS).

Please refer to https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html to download pre-trained models

type OfflineWhisperModelConfig

type OfflineWhisperModelConfig struct {
	Encoder      string
	Decoder      string
	Language     string
	Task         string
	TailPaddings int
}

type OfflineZipformerAudioTaggingModelConfig

type OfflineZipformerAudioTaggingModelConfig struct {
	Model string
}

Configuration for the audio tagging.

type OnlineCtcFstDecoderConfig

type OnlineCtcFstDecoderConfig struct {
	Graph     string
	MaxActive int
}

type OnlineModelConfig

type OnlineModelConfig struct {
	Transducer    OnlineTransducerModelConfig
	Paraformer    OnlineParaformerModelConfig
	Zipformer2Ctc OnlineZipformer2CtcModelConfig
	Tokens        string // Path to tokens.txt
	NumThreads    int    // Number of threads to use for neural network computation
	Provider      string // Optional. Valid values are: cpu, cuda, coreml
	Debug         int    // 1 to show model meta information while loading it.
	ModelType     string // Optional. You can specify it for faster model initialization
	ModelingUnit  string // Optional. cjkchar, bpe, cjkchar+bpe
	BpeVocab      string // Optional.
	TokensBuf     string // Optional.
	TokensBufSize int    // Optional.
}

Configuration for online/streaming models.

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html and https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models

type OnlineParaformerModelConfig

type OnlineParaformerModelConfig struct {
	Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model.
}

Configuration for online/streaming paraformer models

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html to download pre-trained models

type OnlineRecognizer

type OnlineRecognizer struct {
	// contains filtered or unexported fields
}

The online recognizer class. It wraps a pointer from C.

func NewOnlineRecognizer

func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer

The user is responsible for invoking DeleteOnlineRecognizer() to free the returned recognizer and avoid a memory leak.

func (*OnlineRecognizer) Decode

func (recognizer *OnlineRecognizer) Decode(s *OnlineStream)

Decode the stream. Before calling this function, you have to ensure that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.

You usually use it like below:

for recognizer.IsReady(s) {
  recognizer.Decode(s)
}

func (*OnlineRecognizer) DecodeStreams

func (recognizer *OnlineRecognizer) DecodeStreams(s []*OnlineStream)

Decode multiple streams in parallel, i.e., in batch. You have to ensure that each stream is ready for decoding. Otherwise, you will be SAD.

func (*OnlineRecognizer) GetResult

func (recognizer *OnlineRecognizer) GetResult(s *OnlineStream) *OnlineRecognizerResult

Get the current result of the stream since the last invocation of Reset().

func (*OnlineRecognizer) IsEndpoint

func (recognizer *OnlineRecognizer) IsEndpoint(s *OnlineStream) bool

Return true if an endpoint is detected.

You usually use it like below:

if recognizer.IsEndpoint(s) {
   // do your own stuff after detecting an endpoint

   recognizer.Reset(s)
}

func (*OnlineRecognizer) IsReady

func (recognizer *OnlineRecognizer) IsReady(s *OnlineStream) bool

Check whether the stream has enough feature frames for decoding. Return true if this stream is ready for decoding. Return false otherwise.

You will usually use it like below:

for recognizer.IsReady(s) {
   recognizer.Decode(s)
}

func (*OnlineRecognizer) Reset

func (recognizer *OnlineRecognizer) Reset(s *OnlineStream)

After calling this function, the internal neural network model states are reset, IsEndpoint(s) returns false, and GetResult(s) returns an empty result.
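
Combining the methods above, a typical streaming loop might look roughly like the sketch below; readMoreSamples() is a hypothetical helper that returns the next chunk of audio (e.g., from a microphone) and is not part of this package.

stream := sherpa_onnx.NewOnlineStream(recognizer)
defer sherpa_onnx.DeleteOnlineStream(stream)

for {
  samples := readMoreSamples() // hypothetical helper, not part of this package
  stream.AcceptWaveform(16000, samples)

  for recognizer.IsReady(stream) {
    recognizer.Decode(stream)
  }

  if recognizer.IsEndpoint(stream) {
    fmt.Println(recognizer.GetResult(stream).Text)
    recognizer.Reset(stream)
  }
}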

type OnlineRecognizerConfig

type OnlineRecognizerConfig struct {
	FeatConfig  FeatureConfig
	ModelConfig OnlineModelConfig

	// Valid decoding methods: greedy_search, modified_beam_search
	DecodingMethod string

	// Used only when DecodingMethod is modified_beam_search. It specifies
	// the maximum number of paths to keep during the search
	MaxActivePaths int

	EnableEndpoint int // 1 to enable endpoint detection.

	// Please see
	// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
	// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence
	// and Rule3MinUtteranceLength.
	Rule1MinTrailingSilence float32
	Rule2MinTrailingSilence float32
	Rule3MinUtteranceLength float32
	HotwordsFile            string
	HotwordsScore           float32
	BlankPenalty            float32
	CtcFstDecoderConfig     OnlineCtcFstDecoderConfig
	RuleFsts                string
	RuleFars                string
	HotwordsBuf             string
	HotwordsBufSize         int
	Hr                      HomophoneReplacerConfig
}

Configuration for the online/streaming recognizer.

type OnlineRecognizerResult

type OnlineRecognizerResult struct {
	Text string
}

It contains the recognition result for an online stream.

type OnlineStream

type OnlineStream struct {
	// contains filtered or unexported fields
}

The online stream class. It wraps a pointer from C.

func NewKeywordStream

func NewKeywordStream(spotter *KeywordSpotter) *OnlineStream

The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.

func NewKeywordStreamWithKeywords

func NewKeywordStreamWithKeywords(spotter *KeywordSpotter, keywords string) *OnlineStream

The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.

func NewOnlineStream

func NewOnlineStream(recognizer *OnlineRecognizer) *OnlineStream

The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.

func (*OnlineStream) AcceptWaveform

func (s *OnlineStream) AcceptWaveform(sampleRate int, samples []float32)

Input audio samples for the stream.

sampleRate is the actual sample rate of the input audio samples. If it is different from the sample rate expected by the feature extractor, we will do resampling inside.

samples contains audio samples. Each sample is in the range [-1, 1]

func (*OnlineStream) InputFinished

func (s *OnlineStream) InputFinished()

Signal that there will be no incoming audio samples. After calling this function, you cannot call OnlineStream.AcceptWaveform any longer.

The main purpose of this function is to flush the remaining audio samples buffered inside for feature extraction.
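
At the end of a recording, a sketch of flushing and draining the stream with an online recognizer might look like:

stream.InputFinished() // no more audio will arrive; flush buffered samples

for recognizer.IsReady(stream) {
  recognizer.Decode(stream)
}
fmt.Println(recognizer.GetResult(stream).Text)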

type OnlineTransducerModelConfig

type OnlineTransducerModelConfig struct {
	Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
	Decoder string // Path to the decoder model.
	Joiner  string // Path to the joiner model.
}

Configuration for online/streaming transducer models

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html to download pre-trained models

type OnlineZipformer2CtcModelConfig

type OnlineZipformer2CtcModelConfig struct {
	Model string // Path to the onnx model
}

Please refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html to download pre-trained models

type SileroVadModelConfig

type SileroVadModelConfig struct {
	Model              string
	Threshold          float32
	MinSilenceDuration float32
	MinSpeechDuration  float32
	WindowSize         int
	MaxSpeechDuration  float32
}

Configuration for VAD (voice activity detection).

type SpeakerEmbeddingExtractor

type SpeakerEmbeddingExtractor struct {
	// contains filtered or unexported fields
}

func NewSpeakerEmbeddingExtractor

func NewSpeakerEmbeddingExtractor(config *SpeakerEmbeddingExtractorConfig) *SpeakerEmbeddingExtractor

The user has to invoke DeleteSpeakerEmbeddingExtractor() to free the returned value and avoid a memory leak.

func (*SpeakerEmbeddingExtractor) Compute

func (ex *SpeakerEmbeddingExtractor) Compute(stream *OnlineStream) []float32

func (*SpeakerEmbeddingExtractor) CreateStream

func (ex *SpeakerEmbeddingExtractor) CreateStream() *OnlineStream

The user is responsible for invoking DeleteOnlineStream() to free the returned stream and avoid a memory leak.

func (*SpeakerEmbeddingExtractor) Dim

func (ex *SpeakerEmbeddingExtractor) Dim() int

func (*SpeakerEmbeddingExtractor) IsReady

func (ex *SpeakerEmbeddingExtractor) IsReady(stream *OnlineStream) bool

type SpeakerEmbeddingExtractorConfig

type SpeakerEmbeddingExtractorConfig struct {
	Model      string
	NumThreads int
	Debug      int
	Provider   string
}

type SpeakerEmbeddingManager

type SpeakerEmbeddingManager struct {
	// contains filtered or unexported fields
}

func NewSpeakerEmbeddingManager

func NewSpeakerEmbeddingManager(dim int) *SpeakerEmbeddingManager

The user has to invoke DeleteSpeakerEmbeddingManager() to free the returned value and avoid a memory leak.

func (*SpeakerEmbeddingManager) AllSpeakers

func (m *SpeakerEmbeddingManager) AllSpeakers() []string

func (*SpeakerEmbeddingManager) Contains

func (m *SpeakerEmbeddingManager) Contains(name string) bool

func (*SpeakerEmbeddingManager) NumSpeakers

func (m *SpeakerEmbeddingManager) NumSpeakers() int

func (*SpeakerEmbeddingManager) Register

func (m *SpeakerEmbeddingManager) Register(name string, embedding []float32) bool

func (*SpeakerEmbeddingManager) RegisterV

func (m *SpeakerEmbeddingManager) RegisterV(name string, embeddings [][]float32) bool

func (*SpeakerEmbeddingManager) Remove

func (m *SpeakerEmbeddingManager) Remove(name string) bool

func (*SpeakerEmbeddingManager) Search

func (m *SpeakerEmbeddingManager) Search(embedding []float32, threshold float32) string

func (*SpeakerEmbeddingManager) Verify

func (m *SpeakerEmbeddingManager) Verify(name string, embedding []float32, threshold float32) bool
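
A sketch of extracting an embedding and registering/searching it with the manager; the model and wave file paths are placeholders and the fragment assumes the package has been imported as sherpa_onnx.

extractor := sherpa_onnx.NewSpeakerEmbeddingExtractor(&sherpa_onnx.SpeakerEmbeddingExtractorConfig{
  Model:      "./speaker-model.onnx", // placeholder model path
  NumThreads: 1,
})
defer sherpa_onnx.DeleteSpeakerEmbeddingExtractor(extractor)

manager := sherpa_onnx.NewSpeakerEmbeddingManager(extractor.Dim())
defer sherpa_onnx.DeleteSpeakerEmbeddingManager(manager)

stream := extractor.CreateStream()
defer sherpa_onnx.DeleteOnlineStream(stream)

wave := sherpa_onnx.ReadWave("./alice.wav") // placeholder input file
stream.AcceptWaveform(wave.SampleRate, wave.Samples)
stream.InputFinished()

if extractor.IsReady(stream) {
  embedding := extractor.Compute(stream)
  manager.Register("alice", embedding)
  fmt.Println(manager.Search(embedding, 0.5)) // expected to print "alice" if the score exceeds the threshold
}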

type SpeechSegment

type SpeechSegment struct {
	Start   int
	Samples []float32
}

type SpokenLanguageIdentification

type SpokenLanguageIdentification struct {
	// contains filtered or unexported fields
}

func (*SpokenLanguageIdentification) Compute

func (*SpokenLanguageIdentification) CreateStream

func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream

The user has to invoke DeleteOfflineStream() to free the returned value and avoid a memory leak.

type SpokenLanguageIdentificationConfig

type SpokenLanguageIdentificationConfig struct {
	Whisper    SpokenLanguageIdentificationWhisperConfig
	NumThreads int
	Debug      int
	Provider   string
}

type SpokenLanguageIdentificationResult

type SpokenLanguageIdentificationResult struct {
	Lang string
}

type SpokenLanguageIdentificationWhisperConfig

type SpokenLanguageIdentificationWhisperConfig struct {
	Encoder      string
	Decoder      string
	TailPaddings int
}

type VadModelConfig

type VadModelConfig struct {
	SileroVad  SileroVadModelConfig
	SampleRate int
	NumThreads int
	Provider   string
	Debug      int
}

type VoiceActivityDetector

type VoiceActivityDetector struct {
	// contains filtered or unexported fields
}

func NewVoiceActivityDetector

func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector

func (*VoiceActivityDetector) AcceptWaveform

func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32)

func (*VoiceActivityDetector) Clear

func (vad *VoiceActivityDetector) Clear()

func (*VoiceActivityDetector) Flush

func (vad *VoiceActivityDetector) Flush()

func (*VoiceActivityDetector) Front

func (vad *VoiceActivityDetector) Front() *SpeechSegment

func (*VoiceActivityDetector) IsEmpty

func (vad *VoiceActivityDetector) IsEmpty() bool

func (*VoiceActivityDetector) IsSpeech

func (vad *VoiceActivityDetector) IsSpeech() bool

func (*VoiceActivityDetector) Pop

func (vad *VoiceActivityDetector) Pop()

func (*VoiceActivityDetector) Reset

func (vad *VoiceActivityDetector) Reset()
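
A sketch of segmenting speech from incoming audio; the model path is a placeholder and chunks stands for hypothetical slices of samples (e.g., 512 samples each).

config := sherpa_onnx.VadModelConfig{}
config.SileroVad.Model = "./silero_vad.onnx" // placeholder model path
config.SileroVad.Threshold = 0.5
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.WindowSize = 512
config.SampleRate = 16000
config.NumThreads = 1

vad := sherpa_onnx.NewVoiceActivityDetector(&config, 20) // keep up to 20 seconds of audio internally
defer sherpa_onnx.DeleteVoiceActivityDetector(vad)

for _, chunk := range chunks { // chunks: hypothetical [][]float32 of audio samples
  vad.AcceptWaveform(chunk)

  for !vad.IsEmpty() {
    segment := vad.Front()
    fmt.Printf("speech at sample %d, %d samples long\n", segment.Start, len(segment.Samples))
    vad.Pop()
  }
}
vad.Flush()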

type Wave

type Wave = GeneratedAudio

A single-channel wave.

func ReadWave

func ReadWave(filename string) *Wave
