Documentation
¶
Overview ¶
Package stt provides OmniVoice-based speech-to-text for marp2video. It also re-exports the omnivoice subtitle package for convenience.
Index ¶
- Constants
- func GenerateSRTFromResult(result *TranscriptionResult, opts SubtitleOptions) string
- func GenerateVTTFromResult(result *TranscriptionResult, opts SubtitleOptions) string
- func SaveSRTFromResult(result *TranscriptionResult, filePath string, opts SubtitleOptions) error
- func SaveVTTFromResult(result *TranscriptionResult, filePath string, opts SubtitleOptions) error
- type Factory
- type Provider
- func (p *Provider) Name() string
- func (p *Provider) Transcribe(ctx context.Context, audio []byte, config TranscriptionConfig) (*TranscriptionResult, error)
- func (p *Provider) TranscribeFile(ctx context.Context, filePath string, config TranscriptionConfig) (*TranscriptionResult, error)
- func (p *Provider) TranscribeURL(ctx context.Context, url string, config TranscriptionConfig) (*TranscriptionResult, error)
- func (p *Provider) UnderlyingProvider() omnivoice.STTProvider
- type ProviderConfig
- type Segment
- type SubtitleFormat
- type SubtitleOptions
- type TranscriptionConfig
- type TranscriptionResult
- type Word
Constants ¶
const (
	FormatSRT = omnivoice.SubtitleFormatSRT
	FormatVTT = omnivoice.SubtitleFormatVTT
)
Format constants.
Variables ¶
This section is empty.
Functions ¶
func GenerateSRTFromResult ¶
func GenerateSRTFromResult(result *TranscriptionResult, opts SubtitleOptions) string
GenerateSRTFromResult generates SRT from a local TranscriptionResult. For direct omnivoice results, use omnivoice.GenerateSRT instead.
func GenerateVTTFromResult ¶
func GenerateVTTFromResult(result *TranscriptionResult, opts SubtitleOptions) string
GenerateVTTFromResult generates VTT from a local TranscriptionResult. For direct omnivoice results, use omnivoice.GenerateVTT instead.
func SaveSRTFromResult ¶
func SaveSRTFromResult(result *TranscriptionResult, filePath string, opts SubtitleOptions) error
SaveSRTFromResult saves a local TranscriptionResult as SRT subtitles to filePath, returning an error on failure.
func SaveVTTFromResult ¶
func SaveVTTFromResult(result *TranscriptionResult, filePath string, opts SubtitleOptions) error
SaveVTTFromResult saves a local TranscriptionResult as VTT subtitles to filePath, returning an error on failure.
Types ¶
type Factory ¶
type Factory struct {
// contains filtered or unexported fields
}
Factory creates STT providers based on configuration.
func NewFactory ¶
func NewFactory(config ProviderConfig) *Factory
NewFactory creates a new STT provider factory.
func (*Factory) Available ¶
Available returns a list of available provider names based on configured API keys.
func (*Factory) Get ¶
Get returns a provider by name, creating it if necessary. If name is empty, returns the fallback provider.
func (*Factory) SetFallback ¶
SetFallback sets the name of the fallback (default) provider, which Get returns when called with an empty name.
type Provider ¶
type Provider struct {
// contains filtered or unexported fields
}
Provider wraps an OmniVoice STT provider for use with marp2video.
func New ¶
func New(provider omnivoice.STTProvider) *Provider
New creates a new OmniVoice STT provider wrapper.
func (*Provider) Transcribe ¶
func (p *Provider) Transcribe(ctx context.Context, audio []byte, config TranscriptionConfig) (*TranscriptionResult, error)
Transcribe transcribes audio bytes and returns the result.
func (*Provider) TranscribeFile ¶
func (p *Provider) TranscribeFile(ctx context.Context, filePath string, config TranscriptionConfig) (*TranscriptionResult, error)
TranscribeFile transcribes an audio file and returns the result.
func (*Provider) TranscribeURL ¶
func (p *Provider) TranscribeURL(ctx context.Context, url string, config TranscriptionConfig) (*TranscriptionResult, error)
TranscribeURL transcribes audio from a URL and returns the result.
func (*Provider) UnderlyingProvider ¶
func (p *Provider) UnderlyingProvider() omnivoice.STTProvider
UnderlyingProvider returns the wrapped OmniVoice provider for advanced operations.
type ProviderConfig ¶
type ProviderConfig struct {
// ElevenLabsAPIKey is the API key for ElevenLabs.
ElevenLabsAPIKey string
// DeepgramAPIKey is the API key for Deepgram.
DeepgramAPIKey string
}
ProviderConfig holds configuration for creating STT providers.
type Segment ¶
type Segment struct {
// Text is the transcribed text for this segment.
Text string
// StartTime is when the segment starts.
StartTime time.Duration
// EndTime is when the segment ends.
EndTime time.Duration
// Confidence is the average confidence for this segment.
Confidence float64
// Speaker is the speaker identifier (if diarization enabled).
Speaker string
// Words contains word-level details.
Words []Word
}
Segment represents a segment of transcription.
type SubtitleFormat ¶
type SubtitleFormat = omnivoice.SubtitleFormat
SubtitleFormat is an alias for omnivoice.SubtitleFormat.
type SubtitleOptions ¶
type SubtitleOptions = omnivoice.SubtitleOptions
SubtitleOptions is an alias for omnivoice.SubtitleOptions.
func DefaultSubtitleOptions ¶
func DefaultSubtitleOptions() SubtitleOptions
DefaultSubtitleOptions returns sensible defaults for subtitle generation.
type TranscriptionConfig ¶
type TranscriptionConfig struct {
// Language is the BCP-47 language code (e.g., "en-US").
Language string
// Model is the provider-specific model identifier.
Model string
// EnablePunctuation adds punctuation to transcription.
EnablePunctuation bool
// EnableWordTimestamps includes word-level timestamps.
EnableWordTimestamps bool
// EnableSpeakerDiarization identifies different speakers.
EnableSpeakerDiarization bool
// MaxSpeakers is the maximum number of speakers to detect.
MaxSpeakers int
}
TranscriptionConfig configures a transcription request.
type TranscriptionResult ¶
type TranscriptionResult struct {
// Text is the full transcription text.
Text string
// Segments contains segment-level details with timing.
Segments []Segment
// Language is the detected language.
Language string
// Duration is the audio duration.
Duration time.Duration
}
TranscriptionResult contains the result of a transcription.
type Word ¶
type Word struct {
// Text is the transcribed word.
Text string
// StartTime is when the word starts.
StartTime time.Duration
// EndTime is when the word ends.
EndTime time.Duration
// Confidence is the recognition confidence (0.0 to 1.0).
Confidence float64
// Speaker is the speaker identifier.
Speaker string
}
Word represents a single transcribed word with timing.