Documentation
¶
Overview ¶
Package media provides Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for the GAI framework, enabling voice-based AI interactions.
Example (AudioFormatConversion) ¶
Example demonstrates audio format conversion
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/recera/gai/media"
)
// main synthesizes the same sentence in several audio formats with
// Cartesia and prints each resulting stream's format metadata.
func main() {
	synth := media.NewCartesia(
		media.WithCartesiaAPIKey(os.Getenv("CARTESIA_API_KEY")),
	)
	ctx := context.Background()

	const text = "Testing audio format conversion."
	for _, name := range []string{
		media.FormatMP3,
		media.FormatWAV,
		media.FormatOGG,
		media.FormatFLAC,
	} {
		stream, err := synth.Synthesize(ctx, media.SpeechRequest{
			Text:   text,
			Format: name,
		})
		if err != nil {
			// A failure for one format should not stop the others.
			log.Printf("Failed to synthesize %s: %v", name, err)
			continue
		}

		// Inspect the negotiated audio format for this stream.
		info := stream.Format()
		fmt.Printf("Format %s:\n", name)
		fmt.Printf(" MIME: %s\n", info.MIME)
		fmt.Printf(" Encoding: %s\n", info.Encoding)
		fmt.Printf(" Sample Rate: %d Hz\n", info.SampleRate)
		fmt.Printf(" Channels: %d\n", info.Channels)
		fmt.Printf(" Bit Depth: %d\n", info.BitDepth)
		stream.Close()
	}
}
Output:
Example (SpeakToolDataURL) ¶
Example demonstrates creating an audio data URL for web usage
package main
import (
"context"
"encoding/base64"
"fmt"
"log"
"strings"
"github.com/recera/gai/media"
"github.com/recera/gai/tools"
)
// main demonstrates returning synthesized audio as a base64 data URL,
// suitable for embedding directly in an HTML <audio> element.
func main() {
	// A stub provider stands in for a real TTS backend in this example.
	mockTTS := &mockProvider{}

	// Configure the speak tool to default to MP3 output.
	speakTool := media.NewSpeakTool(
		mockTTS,
		media.WithSpeakToolDefaultFormat(media.FormatMP3),
	)

	ctx := context.Background()
	meta := tools.Meta{
		CallID:    "example",
		RequestID: "req-123",
	}

	// Request a data-URL result rather than a temp file.
	input := media.SpeakInput{
		Text:          "Hello, world!",
		Format:        media.FormatMP3,
		ReturnDataURL: true,
	}

	// Execute through the tool interface.
	result, err := executeTool(speakTool, ctx, input, meta)
	if err != nil {
		log.Fatal(err)
	}

	output := result.(media.SpeakOutput)
	if !output.Success {
		return
	}
	// Split "data:...;base64,<payload>" and decode the payload half.
	parts := strings.Split(output.DataURL, ",")
	if len(parts) != 2 {
		return
	}
	decoded, _ := base64.StdEncoding.DecodeString(parts[1])
	fmt.Printf("Generated audio data URL with %d bytes\n", len(decoded))
	fmt.Printf("Can be used in HTML: <audio src='%s...' />\n",
		output.DataURL[:50])
}
// mockProvider is a minimal SpeechProvider used by the example; it
// returns canned audio instead of calling a real TTS service.
type mockProvider struct{}

// Synthesize returns a stream carrying a fixed audio payload.
func (m *mockProvider) Synthesize(ctx context.Context, req media.SpeechRequest) (media.SpeechStream, error) {
	stream := &mockStream{data: []byte("mock audio data")}
	return stream, nil
}

// ListVoices reports an empty voice list; the example never uses it.
func (m *mockProvider) ListVoices(ctx context.Context) ([]media.Voice, error) {
	return []media.Voice{}, nil
}
// mockStream is a SpeechStream that yields its payload as one chunk.
type mockStream struct {
	data []byte
}

// Chunks delivers the payload once, then closes the channel so a
// range loop over it terminates.
func (s *mockStream) Chunks() <-chan []byte {
	out := make(chan []byte, 1)
	out <- s.data
	close(out)
	return out
}

// Format reports an MP3 format descriptor.
func (s *mockStream) Format() media.AudioFormat {
	return media.AudioFormat{
		MIME:     media.MimeMP3,
		Encoding: media.FormatMP3,
	}
}

// Close is a no-op; there are no resources to release.
func (s *mockStream) Close() error { return nil }

// Error always reports success.
func (s *mockStream) Error() error { return nil }
// executeTool stands in for invoking the tool through the framework's
// execution path; it returns a canned SpeakOutput so the example is
// deterministic. The parameters are accepted but unused by this stub.
func executeTool(tool tools.Handle, ctx context.Context, input media.SpeakInput, meta tools.Meta) (interface{}, error) {
	out := media.SpeakOutput{
		Success:         true,
		DataURL:         "data:audio/mpeg;base64,bW9jayBhdWRpbyBkYXRh",
		Format:          media.FormatMP3,
		SizeBytes:       15,
		DurationSeconds: 1.0,
	}
	return out, nil
}
Output:
Example (VoiceCloning) ¶
Example demonstrates voice cloning workflow
package main
import (
"context"
"fmt"
"log"
"os"
"strings"
"github.com/recera/gai/core"
"github.com/recera/gai/media"
)
// main walks through a voice-cloning workflow: transcribe a reference
// clip to recover its text, synthesize new speech in a similar voice,
// then transcribe the generated audio to verify the content.
func main() {
	ctx := context.Background()

	// Step 1: Transcribe reference audio to get the text.
	whisper := media.NewWhisper(
		media.WithWhisperAPIKey(os.Getenv("OPENAI_API_KEY")),
	)
	reference := core.BlobRef{
		Kind: core.BlobURL,
		URL:  "https://example.com/reference-voice.wav",
		MIME: "audio/wav",
	}
	transcription, err := whisper.Transcribe(ctx, media.TranscriptionRequest{
		Audio:    reference,
		Language: "en",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Reference text: %s\n", transcription.Text)

	// Step 2: Generate new speech with similar voice characteristics.
	// (This would require a voice cloning service or custom voice ID.)
	elevenlabs := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)
	stream, err := elevenlabs.Synthesize(ctx, media.SpeechRequest{
		Text:            "This is new text spoken in a similar voice style.",
		Voice:           "custom-cloned-voice", // Would be your cloned voice ID
		Format:          media.FormatMP3,
		Stability:       0.75, // Adjust for voice consistency
		SimilarityBoost: 0.90, // High similarity to original
	})
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	// Drain the synthesized audio into a single buffer.
	var audio []byte
	for chunk := range stream.Chunks() {
		audio = append(audio, chunk...)
	}

	// Step 3: Optionally transcribe the generated audio to verify.
	verification, err := whisper.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind:  core.BlobBytes,
			Bytes: audio,
			MIME:  "audio/mpeg",
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Generated text verification: %s\n", verification.Text)
	fmt.Printf("Match: %v\n", strings.Contains(verification.Text, "similar voice"))
}
Output:
Index ¶
- Constants
- func NewSpeakTool(provider SpeechProvider, opts ...SpeakToolOption) tools.Handle
- type AudioFormat
- type Cartesia
- type CartesiaOption
- type Deepgram
- type DeepgramOption
- type ElevenLabs
- type ElevenLabsOption
- type ProviderConfig
- type SpeakInput
- type SpeakOutput
- type SpeakToolOption
- type SpeakerSegment
- type SpeechProvider
- type SpeechRequest
- type SpeechStream
- type TranscriptionAlternative
- type TranscriptionEvent
- type TranscriptionEventType
- type TranscriptionProvider
- type TranscriptionRequest
- type TranscriptionResult
- type TranscriptionStream
- type Voice
- type Whisper
- type WhisperOption
- type WordTiming
Examples ¶
Constants ¶
const ( FormatMP3 = "mp3" FormatWAV = "wav" FormatOGG = "ogg" FormatOpus = "opus" FormatFLAC = "flac" FormatPCM = "pcm" FormatWebM = "webm" FormatMPEG = "mpeg" FormatULaw = "ulaw" FormatMuLaw = "mulaw" )
Common audio formats
const ( MimeMP3 = "audio/mpeg" MimeWAV = "audio/wav" MimeOGG = "audio/ogg" MimeOpus = "audio/opus" MimeFLAC = "audio/flac" MimeWebM = "audio/webm" MimeBasic = "audio/basic" // for ulaw )
Common MIME types
Variables ¶
This section is empty.
Functions ¶
func NewSpeakTool ¶
func NewSpeakTool(provider SpeechProvider, opts ...SpeakToolOption) tools.Handle
NewSpeakTool creates a tool that allows LLMs to trigger TTS.
Example ¶
Example demonstrates the Speak tool for LLM-triggered TTS
package main
import (
"context"
"fmt"
"log"
"os"
"time"
"github.com/recera/gai/core"
"github.com/recera/gai/media"
"github.com/recera/gai/providers/openai"
"github.com/recera/gai/tools"
)
// main wires a Speak tool into an LLM request so the model can trigger
// text-to-speech, then inspects the tool results from each step.
func main() {
	// TTS backend for the tool.
	tts := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	// Tool configuration: temp files under /tmp/audio, a 1000-char
	// text limit, MP3 output, and cleanup after five minutes.
	speakTool := media.NewSpeakTool(
		tts,
		media.WithSpeakToolTempDir("/tmp/audio"),
		media.WithSpeakToolMaxTextLength(1000),
		media.WithSpeakToolDefaultFormat(media.FormatMP3),
		media.WithSpeakToolCleanup(5*time.Minute),
	)

	// LLM provider that decides when to call the tool.
	ai := openai.New(
		openai.WithAPIKey(os.Getenv("OPENAI_API_KEY")),
		openai.WithModel("gpt-4o-mini"),
	)

	ctx := context.Background()
	result, err := ai.GenerateText(ctx, core.Request{
		Messages: []core.Message{
			{Role: core.System, Parts: []core.Part{
				core.Text{Text: "You are a helpful assistant. When asked to speak or say something aloud, use the speak tool."},
			}},
			{Role: core.User, Parts: []core.Part{
				core.Text{Text: "Please say 'Welcome to the GAI framework' out loud."},
			}},
		},
		Tools:      []core.ToolHandle{tools.NewCoreAdapter(speakTool)},
		ToolChoice: core.ToolAuto,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("AI Response: %s\n", result.Text)

	// Walk the steps to find the speak tool's execution result, which
	// carries the audio file path or data URL.
	for _, step := range result.Steps {
		for _, exec := range step.ToolResults {
			if exec.Name != "speak" {
				continue
			}
			fmt.Printf("Audio generated: %+v\n", exec.Result)
		}
	}
}
Output:
Types ¶
type AudioFormat ¶
type AudioFormat struct {
// MIME type (e.g., "audio/mpeg", "audio/wav").
MIME string
// Sample rate in Hz (e.g., 44100, 16000).
SampleRate int
// Number of channels (1 for mono, 2 for stereo).
Channels int
// Bit depth (e.g., 16, 24).
BitDepth int
// Encoding format (e.g., "pcm", "mp3", "opus").
Encoding string
// Bitrate in bits per second (for compressed formats).
Bitrate int
}
AudioFormat describes audio encoding and properties.
type Cartesia ¶
type Cartesia struct {
// contains filtered or unexported fields
}
Cartesia implements SpeechProvider for Cartesia TTS API.
func NewCartesia ¶
func NewCartesia(opts ...CartesiaOption) *Cartesia
NewCartesia creates a new Cartesia TTS provider.
func (*Cartesia) ListVoices ¶
ListVoices returns available Cartesia voices.
func (*Cartesia) Synthesize ¶
func (c *Cartesia) Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)
Synthesize converts text to speech using Cartesia.
type CartesiaOption ¶
type CartesiaOption func(*Cartesia)
CartesiaOption configures the Cartesia provider.
func WithCartesiaAPIKey ¶
func WithCartesiaAPIKey(key string) CartesiaOption
WithCartesiaAPIKey sets the API key.
func WithCartesiaBaseURL ¶
func WithCartesiaBaseURL(url string) CartesiaOption
WithCartesiaBaseURL sets a custom base URL.
func WithCartesiaModel ¶
func WithCartesiaModel(model string) CartesiaOption
WithCartesiaModel sets the default model.
func WithCartesiaVoice ¶
func WithCartesiaVoice(voice string) CartesiaOption
WithCartesiaVoice sets the default voice.
type Deepgram ¶
type Deepgram struct {
// contains filtered or unexported fields
}
Deepgram implements TranscriptionProvider for Deepgram API.
func NewDeepgram ¶
func NewDeepgram(opts ...DeepgramOption) *Deepgram
NewDeepgram creates a new Deepgram STT provider.
func (*Deepgram) Transcribe ¶
func (d *Deepgram) Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)
Transcribe converts audio to text using Deepgram.
Example ¶
Example demonstrates pre-recorded audio transcription with Deepgram, including speaker diarization and alternatives (for real-time streaming, see TranscribeStream)
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/recera/gai/core"
"github.com/recera/gai/media"
)
// main transcribes pre-recorded audio with Deepgram, enabling speaker
// diarization and alternatives, and prints the structured results.
func main() {
	deepgram := media.NewDeepgram(
		media.WithDeepgramAPIKey(os.Getenv("DEEPGRAM_API_KEY")),
		media.WithDeepgramModel("nova-2"),
	)

	ctx := context.Background()
	result, err := deepgram.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind:  core.BlobBytes,
			Bytes: getAudioBytes(), // Your audio data
			MIME:  "audio/wav",
		},
		Language:        "en",
		Punctuate:       true,
		Diarize:         true, // Enable speaker identification
		FilterProfanity: false,
		MaxAlternatives: 2,
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Transcription: %s\n", result.Text)
	fmt.Printf("Confidence: %.2f\n", result.Confidence)

	// Per-speaker segments, populated when diarization is enabled.
	if len(result.Speakers) > 0 {
		fmt.Println("\nSpeaker segments:")
		for _, seg := range result.Speakers {
			fmt.Printf(" Speaker %d (%v-%v): %s\n",
				seg.Speaker, seg.Start, seg.End, seg.Text)
		}
	}

	// Alternative transcriptions, up to MaxAlternatives.
	if len(result.Alternatives) > 0 {
		fmt.Println("\nAlternative transcriptions:")
		for i, alt := range result.Alternatives {
			fmt.Printf(" %d. (confidence %.2f): %s\n",
				i+1, alt.Confidence, alt.Text)
		}
	}
}
// getAudioBytes returns placeholder bytes standing in for real audio
// content in this example.
func getAudioBytes() []byte {
	const sample = "fake audio data for example"
	return []byte(sample)
}
Output:
func (*Deepgram) TranscribeStream ¶
func (d *Deepgram) TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)
TranscribeStream processes streaming audio input using WebSocket.
type DeepgramOption ¶
type DeepgramOption func(*Deepgram)
DeepgramOption configures the Deepgram provider.
func WithDeepgramAPIKey ¶
func WithDeepgramAPIKey(key string) DeepgramOption
WithDeepgramAPIKey sets the API key.
func WithDeepgramBaseURL ¶
func WithDeepgramBaseURL(url string) DeepgramOption
WithDeepgramBaseURL sets a custom base URL.
func WithDeepgramModel ¶
func WithDeepgramModel(model string) DeepgramOption
WithDeepgramModel sets the default model.
type ElevenLabs ¶
type ElevenLabs struct {
// contains filtered or unexported fields
}
ElevenLabs implements SpeechProvider for ElevenLabs TTS API.
func NewElevenLabs ¶
func NewElevenLabs(opts ...ElevenLabsOption) *ElevenLabs
NewElevenLabs creates a new ElevenLabs TTS provider.
func (*ElevenLabs) ListVoices ¶
func (el *ElevenLabs) ListVoices(ctx context.Context) ([]Voice, error)
ListVoices returns available ElevenLabs voices.
Example ¶
Example demonstrates listing available voices
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/recera/gai/media"
)
// main lists available ElevenLabs voices and prints details for the
// first three.
func main() {
	tts := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	ctx := context.Background()
	voices, err := tts.ListVoices(ctx)
	if err != nil {
		log.Fatal(err)
	}

	// Show at most the first three voices.
	limit := 3
	if len(voices) < limit {
		limit = len(voices)
	}
	for _, voice := range voices[:limit] {
		fmt.Printf("Voice: %s\n", voice.Name)
		fmt.Printf(" ID: %s\n", voice.ID)
		fmt.Printf(" Gender: %s, Age: %s\n", voice.Gender, voice.Age)
		fmt.Printf(" Premium: %v\n", voice.Premium)
		fmt.Println()
	}
}
Output:
func (*ElevenLabs) Synthesize ¶
func (el *ElevenLabs) Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)
Synthesize converts text to speech using ElevenLabs.
Example ¶
Example demonstrates basic TTS synthesis with ElevenLabs
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/recera/gai/media"
)
// main synthesizes a sentence with ElevenLabs and reports how many
// bytes of MP3 audio were produced.
func main() {
	tts := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	ctx := context.Background()
	stream, err := tts.Synthesize(ctx, media.SpeechRequest{
		Text:   "Hello from the GAI framework! This is a test of text-to-speech synthesis.",
		Voice:  "Rachel",
		Format: media.FormatMP3,
		Speed:  1.0,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	// Count the bytes as they stream in; a real caller would write
	// each chunk to a file or forward it to a client.
	total := 0
	for chunk := range stream.Chunks() {
		total += len(chunk)
	}
	fmt.Printf("Generated %d bytes of MP3 audio\n", total)
}
Output:
type ElevenLabsOption ¶
type ElevenLabsOption func(*ElevenLabs)
ElevenLabsOption configures the ElevenLabs provider.
func WithElevenLabsAPIKey ¶
func WithElevenLabsAPIKey(key string) ElevenLabsOption
WithElevenLabsAPIKey sets the API key.
func WithElevenLabsBaseURL ¶
func WithElevenLabsBaseURL(url string) ElevenLabsOption
WithElevenLabsBaseURL sets a custom base URL.
func WithElevenLabsModel ¶
func WithElevenLabsModel(model string) ElevenLabsOption
WithElevenLabsModel sets the default model.
func WithElevenLabsVoice ¶
func WithElevenLabsVoice(voice string) ElevenLabsOption
WithElevenLabsVoice sets the default voice.
type ProviderConfig ¶
type ProviderConfig struct {
// API key for authentication.
APIKey string
// Base URL for the API.
BaseURL string
// Organization ID (if applicable).
Organization string
// Project ID (if applicable).
Project string
// Default voice to use.
DefaultVoice string
// Default model to use.
DefaultModel string
// Default audio format.
DefaultFormat string
// Request timeout.
Timeout time.Duration
// Maximum retries for failed requests.
MaxRetries int
// Custom HTTP headers.
Headers map[string]string
}
ProviderConfig holds common configuration for audio providers.
type SpeakInput ¶
type SpeakInput struct {
// Text to speak (required).
Text string `json:"text" jsonschema:"required,description=The text to convert to speech"`
// Voice to use (optional, uses provider default if not specified).
Voice string `json:"voice,omitempty" jsonschema:"description=Voice ID or name to use for synthesis"`
// Format for the output (optional, defaults to mp3).
Format string `json:"format,omitempty" jsonschema:"enum=mp3,enum=wav,enum=ogg,description=Audio format for the output"`
// Speed of speech (optional, 0.5 to 2.0, default 1.0).
Speed float32 `json:"speed,omitempty" jsonschema:"minimum=0.5,maximum=2.0,description=Speaking speed (0.5 to 2.0)"`
// Save to file (optional, if true saves to temp file and returns path).
SaveToFile bool `json:"save_to_file,omitempty" jsonschema:"description=Save audio to a temporary file"`
// Return as data URL (optional, if true returns base64 data URL).
ReturnDataURL bool `json:"return_data_url,omitempty" jsonschema:"description=Return audio as a base64 data URL"`
}
SpeakInput defines the input for the Speak tool.
type SpeakOutput ¶
type SpeakOutput struct {
// Success indicates whether the speech synthesis succeeded.
Success bool `json:"success"`
// FilePath is the path to the saved audio file (if SaveToFile was true).
FilePath string `json:"file_path,omitempty"`
// DataURL is the base64-encoded data URL (if ReturnDataURL was true).
DataURL string `json:"data_url,omitempty"`
// Format of the audio.
Format string `json:"format"`
// Duration in seconds (estimated).
DurationSeconds float64 `json:"duration_seconds,omitempty"`
// Size in bytes.
SizeBytes int `json:"size_bytes"`
// Error message if synthesis failed.
Error string `json:"error,omitempty"`
}
SpeakOutput defines the output from the Speak tool.
type SpeakToolOption ¶
type SpeakToolOption func(*speakToolConfig)
SpeakToolOption configures the Speak tool.
func WithSpeakToolCleanup ¶
func WithSpeakToolCleanup(duration time.Duration) SpeakToolOption
WithSpeakToolCleanup sets automatic file cleanup duration.
func WithSpeakToolDefaultFormat ¶
func WithSpeakToolDefaultFormat(format string) SpeakToolOption
WithSpeakToolDefaultFormat sets the default audio format.
func WithSpeakToolMaxTextLength ¶
func WithSpeakToolMaxTextLength(length int) SpeakToolOption
WithSpeakToolMaxTextLength sets the maximum text length.
func WithSpeakToolTempDir ¶
func WithSpeakToolTempDir(dir string) SpeakToolOption
WithSpeakToolTempDir sets the temporary directory for audio files.
type SpeakerSegment ¶
SpeakerSegment identifies a speaker's portion of the audio.
type SpeechProvider ¶
type SpeechProvider interface {
// Synthesize converts text to speech, returning a stream of audio chunks.
Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)
// ListVoices returns available voices for this provider.
ListVoices(ctx context.Context) ([]Voice, error)
}
SpeechProvider synthesizes text into speech audio.
type SpeechRequest ¶
type SpeechRequest struct {
// Text to synthesize (required).
Text string
// Voice ID or name (provider-specific).
Voice string
// Model to use (provider-specific, e.g., "eleven_multilingual_v2").
Model string
// Output format (e.g., "mp3", "pcm", "opus").
Format string
// Speaking speed (0.5 to 2.0, 1.0 is normal).
Speed float32
// Voice stability (provider-specific, 0.0 to 1.0).
Stability float32
// Voice similarity boost (provider-specific, 0.0 to 1.0).
SimilarityBoost float32
// Additional provider-specific options.
Options map[string]any
}
SpeechRequest configures text-to-speech synthesis.
type SpeechStream ¶
type SpeechStream interface {
// Chunks returns a channel of audio data chunks.
Chunks() <-chan []byte
// Format returns the audio format information.
Format() AudioFormat
// Close stops the stream and releases resources.
Close() error
// Error returns any error that occurred during streaming.
Error() error
}
SpeechStream provides streaming audio output from TTS.
type TranscriptionAlternative ¶
TranscriptionAlternative represents an alternative transcription.
type TranscriptionEvent ¶
type TranscriptionEvent struct {
// Type of event.
Type TranscriptionEventType
// Transcribed text (for partial and final results).
Text string
// Whether this is a final result.
IsFinal bool
// Word timing (if available).
Words []WordTiming
// Error (for error events).
Error error
}
TranscriptionEvent represents a real-time transcription update.
type TranscriptionEventType ¶
type TranscriptionEventType int
TranscriptionEventType identifies the type of transcription event.
const ( TranscriptionPartial TranscriptionEventType = iota TranscriptionFinal TranscriptionError TranscriptionEnd )
type TranscriptionProvider ¶
type TranscriptionProvider interface {
// Transcribe converts audio to text.
Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)
// TranscribeStream processes streaming audio input.
TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)
}
TranscriptionProvider converts speech audio to text.
type TranscriptionRequest ¶
type TranscriptionRequest struct {
// Audio input source.
Audio core.BlobRef
// Language code (e.g., "en", "es", "fr").
Language string
// Model to use (provider-specific).
Model string
// Enable punctuation restoration.
Punctuate bool
// Enable speaker diarization.
Diarize bool
// Enable profanity filtering.
FilterProfanity bool
// Custom vocabulary/keywords.
Keywords []string
// Maximum alternatives to return.
MaxAlternatives int
// Additional provider-specific options.
Options map[string]any
}
TranscriptionRequest configures speech-to-text transcription.
type TranscriptionResult ¶
type TranscriptionResult struct {
// Primary transcription text.
Text string
// Alternative transcriptions with confidence scores.
Alternatives []TranscriptionAlternative
// Word-level timing information.
Words []WordTiming
// Detected language (if auto-detected).
Language string
// Overall confidence score (0.0 to 1.0).
Confidence float32
// Duration of the audio.
Duration time.Duration
// Speaker segments (if diarization enabled).
Speakers []SpeakerSegment
}
TranscriptionResult contains the transcribed text and metadata.
type TranscriptionStream ¶
type TranscriptionStream interface {
// Events returns a channel of transcription events.
Events() <-chan TranscriptionEvent
// Close stops the stream and releases resources.
Close() error
}
TranscriptionStream provides real-time transcription of streaming audio.
type Voice ¶
type Voice struct {
// Unique voice identifier.
ID string
// Human-readable voice name.
Name string
// Voice description.
Description string
// Language codes supported by this voice.
Languages []string
// Voice gender (if specified).
Gender string
// Voice age category (e.g., "young", "middle-aged", "old").
Age string
// Voice style/use case tags (e.g., "conversational", "narrative").
Tags []string
// Preview audio URL (if available).
PreviewURL string
// Whether this is a premium voice.
Premium bool
}
Voice represents an available TTS voice.
type Whisper ¶
type Whisper struct {
// contains filtered or unexported fields
}
Whisper implements TranscriptionProvider for OpenAI Whisper API or compatible servers.
func NewWhisper ¶
func NewWhisper(opts ...WhisperOption) *Whisper
NewWhisper creates a new Whisper STT provider.
func (*Whisper) Transcribe ¶
func (w *Whisper) Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)
Transcribe converts audio to text using Whisper.
Example ¶
Example demonstrates transcribing audio with Whisper
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/recera/gai/core"
"github.com/recera/gai/media"
)
// main transcribes a remote audio file with Whisper and prints the
// text, detected language, duration, and the first few word timings.
func main() {
	stt := media.NewWhisper(
		media.WithWhisperAPIKey(os.Getenv("OPENAI_API_KEY")),
	)

	// Transcribe audio referenced by URL.
	ctx := context.Background()
	result, err := stt.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind: core.BlobURL,
			URL:  "https://example.com/sample-audio.wav",
			MIME: "audio/wav",
		},
		Language:  "en",
		Punctuate: true,
		Keywords:  []string{"GAI", "framework", "artificial intelligence"},
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Transcription: %s\n", result.Text)
	fmt.Printf("Language: %s\n", result.Language)
	fmt.Printf("Duration: %v\n", result.Duration)

	// Word-level timing, limited to the first five entries.
	if len(result.Words) > 0 {
		fmt.Println("\nFirst 5 words with timing:")
		count := len(result.Words)
		if count > 5 {
			count = 5
		}
		for _, word := range result.Words[:count] {
			fmt.Printf(" %s: %v-%v\n", word.Word, word.Start, word.End)
		}
	}
}
Output:
func (*Whisper) TranscribeStream ¶
func (w *Whisper) TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)
TranscribeStream processes streaming audio input (not supported by standard Whisper).
type WhisperOption ¶
type WhisperOption func(*Whisper)
WhisperOption configures the Whisper provider.
func WithWhisperAPIKey ¶
func WithWhisperAPIKey(key string) WhisperOption
WithWhisperAPIKey sets the API key.
func WithWhisperBaseURL ¶
func WithWhisperBaseURL(url string) WhisperOption
WithWhisperBaseURL sets a custom base URL (for self-hosted Whisper).
func WithWhisperModel ¶
func WithWhisperModel(model string) WhisperOption
WithWhisperModel sets the default model.
func WithWhisperOrganization ¶
func WithWhisperOrganization(org string) WhisperOption
WithWhisperOrganization sets the OpenAI organization ID.