media

package
v0.0.0-...-5965738 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 26, 2025 License: Apache-2.0 Imports: 17 Imported by: 0

README

GAI Media Package

The media package provides Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for the GAI framework, enabling voice-based AI interactions.

Features

  • Text-to-Speech (TTS): Convert text to natural-sounding speech

    • ElevenLabs provider with multiple voices and languages
    • Cartesia provider with professional narrator voices
    • Streaming audio output with chunk-based delivery
    • Multiple audio formats (MP3, WAV, OGG, FLAC, PCM)
  • Speech-to-Text (STT): Convert audio to text transcriptions

    • OpenAI Whisper for accurate transcription
    • Deepgram with real-time streaming and speaker diarization
    • Word-level timing information
    • Multiple language support with auto-detection
  • Speak Tool: Allow LLMs to trigger TTS synthesis

    • Type-safe tool for AI agents
    • File saving and data URL generation
    • Automatic cleanup options

Installation

import "github.com/recera/gai/media"

Quick Start

Text-to-Speech with ElevenLabs
// Create TTS provider
tts := media.NewElevenLabs(
    media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
)

// Synthesize speech
ctx := context.Background()
stream, err := tts.Synthesize(ctx, media.SpeechRequest{
    Text:   "Hello from the GAI framework!",
    Voice:  "Rachel",
    Format: media.FormatMP3,
    Speed:  1.0,
})
if err != nil {
    log.Fatal(err)
}
defer stream.Close()

// Save audio to file, checking errors instead of discarding them.
file, err := os.Create("output.mp3")
if err != nil {
    log.Fatal(err)
}
defer file.Close()

for chunk := range stream.Chunks() {
    if _, err := file.Write(chunk); err != nil {
        log.Fatal(err)
    }
}

// Surface any error that occurred while the stream was being produced.
if err := stream.Error(); err != nil {
    log.Fatal(err)
}
Speech-to-Text with Whisper
// Create STT provider
stt := media.NewWhisper(
    media.WithWhisperAPIKey(os.Getenv("OPENAI_API_KEY")),
)

// Transcribe audio
result, err := stt.Transcribe(ctx, media.TranscriptionRequest{
    Audio: core.BlobRef{
        Kind: core.BlobURL,
        URL:  "https://example.com/audio.wav",
        MIME: "audio/wav",
    },
    Language:  "en",
    Punctuate: true,
})
if err != nil {
    log.Fatal(err)
}

fmt.Println("Transcription:", result.Text)
fmt.Println("Duration:", result.Duration)
Real-time Transcription with Deepgram
// Create Deepgram provider
deepgram := media.NewDeepgram(
    media.WithDeepgramAPIKey(os.Getenv("DEEPGRAM_API_KEY")),
)

// Start streaming transcription
audioReader := getAudioStream() // Your audio source
stream, err := deepgram.TranscribeStream(ctx, audioReader)
if err != nil {
    log.Fatal(err)
}
defer stream.Close()

// Process transcription events
for event := range stream.Events() {
    switch event.Type {
    case media.TranscriptionPartial:
        fmt.Printf("Partial: %s\n", event.Text)
    case media.TranscriptionFinal:
        fmt.Printf("Final: %s\n", event.Text)
    case media.TranscriptionError:
        fmt.Printf("Error: %v\n", event.Error)
    }
}
LLM-Triggered TTS with Speak Tool
// Create TTS provider and Speak tool
tts := media.NewElevenLabs(
    media.WithElevenLabsAPIKey(apiKey),
)

speakTool := media.NewSpeakTool(
    tts,
    media.WithSpeakToolTempDir("/tmp/audio"),
    media.WithSpeakToolCleanup(5*time.Minute),
)

// Use with AI agent.
// NewSpeakTool returns a tools.Handle; wrap it with tools.NewCoreAdapter
// so it satisfies the core.ToolHandle type expected by core.Request.
request := core.Request{
    Messages: []core.Message{
        {Role: core.System, Parts: []core.Part{
            core.Text{Text: "You can speak by calling the speak tool."},
        }},
        {Role: core.User, Parts: []core.Part{
            core.Text{Text: "Say hello and introduce yourself."},
        }},
    },
    Tools:      []core.ToolHandle{tools.NewCoreAdapter(speakTool)},
    ToolChoice: core.ToolAuto,
}

// The AI will call the speak tool to generate audio
result, err := provider.GenerateText(ctx, request)
if err != nil {
    log.Fatal(err)
}

Providers

TTS Providers
ElevenLabs

High-quality neural TTS with expressive voices:

tts := media.NewElevenLabs(
    media.WithElevenLabsAPIKey(apiKey),
    media.WithElevenLabsVoice("Rachel"),     // Default voice
    media.WithElevenLabsModel("eleven_multilingual_v2"),
)

// List available voices
voices, _ := tts.ListVoices(ctx)
for _, voice := range voices {
    fmt.Printf("%s: %s\n", voice.Name, voice.Description)
}
Cartesia

Professional narrator voices optimized for long-form content:

tts := media.NewCartesia(
    media.WithCartesiaAPIKey(apiKey),
    media.WithCartesiaVoice("narrator-professional"),
    media.WithCartesiaModel("sonic-english"),
)
STT Providers
OpenAI Whisper

Accurate transcription with word-level timing:

stt := media.NewWhisper(
    media.WithWhisperAPIKey(apiKey),
    media.WithWhisperModel("whisper-1"),
    media.WithWhisperBaseURL("https://api.openai.com"), // Or self-hosted
)
Deepgram

Real-time transcription with advanced features:

stt := media.NewDeepgram(
    media.WithDeepgramAPIKey(apiKey),
    media.WithDeepgramModel("nova-2"),
)

// Transcribe with diarization
result, _ := stt.Transcribe(ctx, media.TranscriptionRequest{
    Audio:           audioBlob,
    Diarize:         true,      // Speaker identification
    Punctuate:       true,      // Add punctuation
    FilterProfanity: false,     // Keep original words
    MaxAlternatives: 3,         // Get multiple transcriptions
})

// Access speaker segments
for _, segment := range result.Speakers {
    fmt.Printf("Speaker %d: %s\n", segment.Speaker, segment.Text)
}

Audio Formats

The media package supports various audio formats:

  • MP3 (audio/mpeg) - Compressed, widely supported
  • WAV (audio/wav) - Uncompressed, high quality
  • OGG (audio/ogg) - Open format, good compression
  • Opus (audio/opus) - Excellent for speech
  • FLAC (audio/flac) - Lossless compression
  • PCM (audio/pcm) - Raw audio data
  • WebM (audio/webm) - Web-optimized
  • μ-law/A-law - Telephony formats

Advanced Features

Custom Voice Settings
req := media.SpeechRequest{
    Text:            "Advanced synthesis",
    Voice:           "custom-voice",
    Speed:           1.2,              // 20% faster
    Stability:       0.8,              // More expressive
    SimilarityBoost: 0.9,              // Closer to original voice
    Options: map[string]any{
        "emotion": "excited",          // Provider-specific
    },
}
Transcription with Keywords
result, _ := stt.Transcribe(ctx, media.TranscriptionRequest{
    Audio:    audioBlob,
    Keywords: []string{"GAI", "framework", "OpenAI"}, // Boost recognition
    Language: "en",
})
Streaming Audio Collection
stream, _ := tts.Synthesize(ctx, req)

// Stream to HTTP response
http.HandleFunc("/audio", func(w http.ResponseWriter, r *http.Request) {
    w.Header().Set("Content-Type", "audio/mpeg")
    w.Header().Set("Transfer-Encoding", "chunked")
    
    for chunk := range stream.Chunks() {
        w.Write(chunk)
        w.(http.Flusher).Flush()
    }
})

Error Handling

All providers use the GAI framework's unified error taxonomy:

stream, err := tts.Synthesize(ctx, req)
if err != nil {
    if aiErr, ok := err.(*core.AIError); ok {
        switch aiErr.Code {
        case core.ErrorRateLimited:
            // Wait and retry
            time.Sleep(aiErr.RetryAfter)
        case core.ErrorUnauthorized:
            // Check API key
        case core.ErrorProviderUnavailable:
            // Use fallback provider
        }
    }
}

Performance

Benchmark results on M1 MacBook Pro:

BenchmarkElevenLabsSynthesize-8         1000    1.2ms/op    2KB/op
BenchmarkWhisperTranscribe-8             500    2.4ms/op    5KB/op
BenchmarkDeepgramTranscribe-8            500    2.1ms/op    4KB/op
BenchmarkSpeakTool-8                    5000    240μs/op    1KB/op
BenchmarkParallelSynthesis-8           10000    120μs/op    0.5KB/op

Testing

Run unit tests:

go test ./media

Run integration tests (requires API keys):

ELEVENLABS_API_KEY=xxx OPENAI_API_KEY=xxx go test -tags=integration ./media

Run benchmarks:

go test -bench=. ./media

Environment Variables

  • ELEVENLABS_API_KEY - ElevenLabs API key
  • CARTESIA_API_KEY - Cartesia API key
  • OPENAI_API_KEY - OpenAI API key (for Whisper)
  • DEEPGRAM_API_KEY - Deepgram API key

Contributing

The media package follows the GAI framework patterns:

  • Provider-agnostic interfaces
  • Streaming-first design
  • Comprehensive error handling
  • Thread-safe operations
  • Zero-allocation hot paths where possible

License

Part of the GAI framework. See main LICENSE file.

Documentation

Overview

Package media provides Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for the GAI framework, enabling voice-based AI interactions.

Example (AudioFormatConversion)

Example demonstrates audio format conversion

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/recera/gai/media"
)

func main() {
	// Cartesia TTS provider; reads the API key from the environment.
	tts := media.NewCartesia(
		media.WithCartesiaAPIKey(os.Getenv("CARTESIA_API_KEY")),
	)

	ctx := context.Background()
	// Formats to exercise; each one is requested from the same provider.
	formats := []string{
		media.FormatMP3,
		media.FormatWAV,
		media.FormatOGG,
		media.FormatFLAC,
	}

	text := "Testing audio format conversion."

	for _, format := range formats {
		stream, err := tts.Synthesize(ctx, media.SpeechRequest{
			Text:   text,
			Format: format,
		})
		if err != nil {
			// Log and keep going so one unsupported format does not
			// stop the whole demonstration.
			log.Printf("Failed to synthesize %s: %v", format, err)
			continue
		}

		// Get format info
		audioFormat := stream.Format()
		fmt.Printf("Format %s:\n", format)
		fmt.Printf("  MIME: %s\n", audioFormat.MIME)
		fmt.Printf("  Encoding: %s\n", audioFormat.Encoding)
		fmt.Printf("  Sample Rate: %d Hz\n", audioFormat.SampleRate)
		fmt.Printf("  Channels: %d\n", audioFormat.Channels)
		fmt.Printf("  Bit Depth: %d\n", audioFormat.BitDepth)

		// Close explicitly each iteration; a defer inside the loop would
		// not run until main returns.
		stream.Close()
	}
}
Example (SpeakToolDataURL)

Example demonstrates creating an audio data URL for web usage

package main

import (
	"context"
	"encoding/base64"
	"fmt"
	"log"
	"strings"

	"github.com/recera/gai/media"
	"github.com/recera/gai/tools"
)

func main() {
	// Create mock provider for demonstration
	mockTTS := &mockProvider{}

	// Create speak tool configured for data URLs
	speakTool := media.NewSpeakTool(
		mockTTS,
		media.WithSpeakToolDefaultFormat(media.FormatMP3),
	)

	ctx := context.Background()
	// Tool-call metadata identifying this invocation to the tool runtime.
	meta := tools.Meta{
		CallID:    "example",
		RequestID: "req-123",
	}

	// Execute speak tool
	input := media.SpeakInput{
		Text:          "Hello, world!",
		Format:        media.FormatMP3,
		ReturnDataURL: true, // request a base64 data URL instead of a file path
	}

	// Execute through the tool interface
	result, err := executeTool(speakTool, ctx, input, meta)
	if err != nil {
		log.Fatal(err)
	}

	output := result.(media.SpeakOutput)
	if output.Success {
		// Extract the base64 data
		// A data URL has the shape "data:<mime>;base64,<payload>";
		// splitting on the comma isolates the payload.
		parts := strings.Split(output.DataURL, ",")
		if len(parts) == 2 {
			decoded, _ := base64.StdEncoding.DecodeString(parts[1])
			fmt.Printf("Generated audio data URL with %d bytes\n", len(decoded))
			fmt.Printf("Can be used in HTML: <audio src='%s...' />\n",
				output.DataURL[:50])
		}
	}
}

// mockProvider is a stand-in SpeechProvider so the example runs without
// network access or an API key.
type mockProvider struct{}

// Synthesize returns a canned in-memory stream instead of calling a TTS API.
func (m *mockProvider) Synthesize(ctx context.Context, req media.SpeechRequest) (media.SpeechStream, error) {
	return &mockStream{
		data: []byte("mock audio data"),
	}, nil
}

// ListVoices reports an empty voice list; the mock offers none.
func (m *mockProvider) ListVoices(ctx context.Context) ([]media.Voice, error) {
	return []media.Voice{}, nil
}

// mockStream is a minimal SpeechStream backed by an in-memory byte slice.
type mockStream struct {
	data []byte
}

// Chunks delivers the whole payload as one chunk and then closes the channel.
func (s *mockStream) Chunks() <-chan []byte {
	out := make(chan []byte, 1)
	out <- s.data
	close(out)
	return out
}

// Format reports a fixed MP3 format for the mock audio.
func (s *mockStream) Format() media.AudioFormat {
	return media.AudioFormat{
		Encoding: media.FormatMP3,
		MIME:     media.MimeMP3,
	}
}

// Close is a no-op; the mock holds no resources.
func (s *mockStream) Close() error { return nil }

// Error always reports success.
func (s *mockStream) Error() error { return nil }

// executeTool is a demonstration stub: it ignores its arguments and returns a
// canned media.SpeakOutput so the example stays deterministic and offline.
// Real code would invoke the tool through the tools.Handle interface.
func executeTool(tool tools.Handle, ctx context.Context, input media.SpeakInput, meta tools.Meta) (interface{}, error) {

	return media.SpeakOutput{
		Success:         true,
		DataURL:         "data:audio/mpeg;base64,bW9jayBhdWRpbyBkYXRh", // base64 of "mock audio data"
		Format:          media.FormatMP3,
		SizeBytes:       15,
		DurationSeconds: 1.0,
	}, nil
}
Example (VoiceCloning)

Example demonstrates voice cloning workflow

package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/recera/gai/core"
	"github.com/recera/gai/media"
)

func main() {
	// This example shows a complete voice cloning workflow:
	// 1. Transcribe reference audio
	// 2. Generate new speech in the same style
	// 3. Verify the output

	ctx := context.Background()

	// Step 1: Transcribe reference audio to get the text
	whisper := media.NewWhisper(
		media.WithWhisperAPIKey(os.Getenv("OPENAI_API_KEY")),
	)

	// Reference audio is fetched by URL; no bytes are held locally.
	referenceAudio := core.BlobRef{
		Kind: core.BlobURL,
		URL:  "https://example.com/reference-voice.wav",
		MIME: "audio/wav",
	}

	transcription, err := whisper.Transcribe(ctx, media.TranscriptionRequest{
		Audio:    referenceAudio,
		Language: "en",
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Reference text: %s\n", transcription.Text)

	// Step 2: Generate new speech with similar voice characteristics
	// (Note: This would require a voice cloning service or custom voice ID)
	elevenlabs := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	newText := "This is new text spoken in a similar voice style."
	stream, err := elevenlabs.Synthesize(ctx, media.SpeechRequest{
		Text:            newText,
		Voice:           "custom-cloned-voice", // Would be your cloned voice ID
		Format:          media.FormatMP3,
		Stability:       0.75, // Adjust for voice consistency
		SimilarityBoost: 0.90, // High similarity to original
	})
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	// Collect synthesized audio
	// Buffered fully in memory here; fine for short clips like this one.
	var audioData []byte
	for chunk := range stream.Chunks() {
		audioData = append(audioData, chunk...)
	}

	// Step 3: Optionally transcribe the generated audio to verify
	// This time the audio is passed as raw bytes rather than a URL.
	verification, err := whisper.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind:  core.BlobBytes,
			Bytes: audioData,
			MIME:  "audio/mpeg",
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Generated text verification: %s\n", verification.Text)
	fmt.Printf("Match: %v\n", strings.Contains(verification.Text, "similar voice"))
}

Index

Examples

Constants

View Source
const (
	FormatMP3   = "mp3"
	FormatWAV   = "wav"
	FormatOGG   = "ogg"
	FormatOpus  = "opus"
	FormatFLAC  = "flac"
	FormatPCM   = "pcm"
	FormatWebM  = "webm"
	FormatMPEG  = "mpeg"
	FormatULaw  = "ulaw"
	FormatMuLaw = "mulaw"
)

Common audio formats

View Source
const (
	MimeMP3   = "audio/mpeg"
	MimeWAV   = "audio/wav"
	MimeOGG   = "audio/ogg"
	MimeOpus  = "audio/opus"
	MimeFLAC  = "audio/flac"
	MimeWebM  = "audio/webm"
	MimeBasic = "audio/basic" // for ulaw
)

Common MIME types

Variables

This section is empty.

Functions

func NewSpeakTool

func NewSpeakTool(provider SpeechProvider, opts ...SpeakToolOption) tools.Handle

NewSpeakTool creates a tool that allows LLMs to trigger TTS.

Example

Example demonstrates the Speak tool for LLM-triggered TTS

package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/recera/gai/core"
	"github.com/recera/gai/media"
	"github.com/recera/gai/providers/openai"
	"github.com/recera/gai/tools"
)

func main() {
	// Create TTS provider
	tts := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	// Create Speak tool
	speakTool := media.NewSpeakTool(
		tts,
		media.WithSpeakToolTempDir("/tmp/audio"),
		media.WithSpeakToolMaxTextLength(1000),
		media.WithSpeakToolDefaultFormat(media.FormatMP3),
		media.WithSpeakToolCleanup(5*time.Minute),
	)

	// Create AI provider
	ai := openai.New(
		openai.WithAPIKey(os.Getenv("OPENAI_API_KEY")),
		openai.WithModel("gpt-4o-mini"),
	)

	// Use the speak tool in an AI request
	ctx := context.Background()
	result, err := ai.GenerateText(ctx, core.Request{
		Messages: []core.Message{
			{Role: core.System, Parts: []core.Part{
				core.Text{Text: "You are a helpful assistant. When asked to speak or say something aloud, use the speak tool."},
			}},
			{Role: core.User, Parts: []core.Part{
				core.Text{Text: "Please say 'Welcome to the GAI framework' out loud."},
			}},
		},
		// NewCoreAdapter bridges the tools.Handle returned by NewSpeakTool
		// to the core.ToolHandle type expected by core.Request.
		Tools:      []core.ToolHandle{tools.NewCoreAdapter(speakTool)},
		ToolChoice: core.ToolAuto, // the model decides whether to call the tool
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("AI Response: %s\n", result.Text)

	// The AI will have called the speak tool
	// Check the steps for tool execution results
	for _, step := range result.Steps {
		for _, execution := range step.ToolResults {
			if execution.Name == "speak" {
				// The result contains the audio file path or data URL
				fmt.Printf("Audio generated: %+v\n", execution.Result)
			}
		}
	}
}

Types

type AudioFormat

type AudioFormat struct {
	// MIME type (e.g., "audio/mpeg", "audio/wav").
	MIME string

	// Sample rate in Hz (e.g., 44100, 16000).
	SampleRate int

	// Number of channels (1 for mono, 2 for stereo).
	Channels int

	// Bit depth (e.g., 16, 24).
	BitDepth int

	// Encoding format (e.g., "pcm", "mp3", "opus").
	Encoding string

	// Bitrate in bits per second (for compressed formats).
	Bitrate int
}

AudioFormat describes audio encoding and properties.

type Cartesia

type Cartesia struct {
	// contains filtered or unexported fields
}

Cartesia implements SpeechProvider for Cartesia TTS API.

func NewCartesia

func NewCartesia(opts ...CartesiaOption) *Cartesia

NewCartesia creates a new Cartesia TTS provider.

func (*Cartesia) ListVoices

func (c *Cartesia) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns available Cartesia voices.

func (*Cartesia) Synthesize

func (c *Cartesia) Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)

Synthesize converts text to speech using Cartesia.

type CartesiaOption

type CartesiaOption func(*Cartesia)

CartesiaOption configures the Cartesia provider.

func WithCartesiaAPIKey

func WithCartesiaAPIKey(key string) CartesiaOption

WithCartesiaAPIKey sets the API key.

func WithCartesiaBaseURL

func WithCartesiaBaseURL(url string) CartesiaOption

WithCartesiaBaseURL sets a custom base URL.

func WithCartesiaModel

func WithCartesiaModel(model string) CartesiaOption

WithCartesiaModel sets the default model.

func WithCartesiaVoice

func WithCartesiaVoice(voice string) CartesiaOption

WithCartesiaVoice sets the default voice.

type Deepgram

type Deepgram struct {
	// contains filtered or unexported fields
}

Deepgram implements TranscriptionProvider for Deepgram API.

func NewDeepgram

func NewDeepgram(opts ...DeepgramOption) *Deepgram

NewDeepgram creates a new Deepgram STT provider.

func (*Deepgram) Transcribe

func (d *Deepgram) Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)

Transcribe converts audio to text using Deepgram.

Example

Example demonstrates real-time transcription with Deepgram

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/recera/gai/core"
	"github.com/recera/gai/media"
)

func main() {
	// Create Deepgram provider
	deepgram := media.NewDeepgram(
		media.WithDeepgramAPIKey(os.Getenv("DEEPGRAM_API_KEY")),
		media.WithDeepgramModel("nova-2"),
	)

	// Transcribe with advanced features
	ctx := context.Background()
	result, err := deepgram.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind:  core.BlobBytes,
			Bytes: getAudioBytes(), // Your audio data
			MIME:  "audio/wav",
		},
		Language:        "en",
		Punctuate:       true,
		Diarize:         true, // Enable speaker identification
		FilterProfanity: false,
		MaxAlternatives: 2, // also return one alternative transcription
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Transcription: %s\n", result.Text)
	fmt.Printf("Confidence: %.2f\n", result.Confidence)

	// Show speaker segments
	// Speakers is populated only when Diarize was enabled above.
	if len(result.Speakers) > 0 {
		fmt.Println("\nSpeaker segments:")
		for _, segment := range result.Speakers {
			fmt.Printf("  Speaker %d (%v-%v): %s\n",
				segment.Speaker, segment.Start, segment.End, segment.Text)
		}
	}

	// Show alternatives
	if len(result.Alternatives) > 0 {
		fmt.Println("\nAlternative transcriptions:")
		for i, alt := range result.Alternatives {
			fmt.Printf("  %d. (confidence %.2f): %s\n",
				i+1, alt.Confidence, alt.Text)
		}
	}
}

// getAudioBytes is a placeholder audio source for the example; real code
// would read captured or recorded audio here.
func getAudioBytes() []byte {

	return []byte("fake audio data for example")
}

func (*Deepgram) TranscribeStream

func (d *Deepgram) TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)

TranscribeStream processes streaming audio input using WebSocket.

type DeepgramOption

type DeepgramOption func(*Deepgram)

DeepgramOption configures the Deepgram provider.

func WithDeepgramAPIKey

func WithDeepgramAPIKey(key string) DeepgramOption

WithDeepgramAPIKey sets the API key.

func WithDeepgramBaseURL

func WithDeepgramBaseURL(url string) DeepgramOption

WithDeepgramBaseURL sets a custom base URL.

func WithDeepgramModel

func WithDeepgramModel(model string) DeepgramOption

WithDeepgramModel sets the default model.

type ElevenLabs

type ElevenLabs struct {
	// contains filtered or unexported fields
}

ElevenLabs implements SpeechProvider for ElevenLabs TTS API.

func NewElevenLabs

func NewElevenLabs(opts ...ElevenLabsOption) *ElevenLabs

NewElevenLabs creates a new ElevenLabs TTS provider.

func (*ElevenLabs) ListVoices

func (el *ElevenLabs) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns available ElevenLabs voices.

Example

Example demonstrates listing available voices

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/recera/gai/media"
)

// main lists the available ElevenLabs voices and prints details for at
// most the first three of them.
func main() {
	tts := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	voices, err := tts.ListVoices(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	// Cap the listing at three voices.
	limit := len(voices)
	if limit > 3 {
		limit = 3
	}
	for _, voice := range voices[:limit] {
		fmt.Printf("Voice: %s\n", voice.Name)
		fmt.Printf("  ID: %s\n", voice.ID)
		fmt.Printf("  Gender: %s, Age: %s\n", voice.Gender, voice.Age)
		fmt.Printf("  Premium: %v\n", voice.Premium)
		fmt.Println()
	}
}

func (*ElevenLabs) Synthesize

func (el *ElevenLabs) Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)

Synthesize converts text to speech using ElevenLabs.

Example

Example demonstrates basic TTS synthesis with ElevenLabs

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/recera/gai/media"
)

// main synthesizes a short sentence with ElevenLabs and reports how many
// bytes of MP3 audio were produced.
func main() {
	provider := media.NewElevenLabs(
		media.WithElevenLabsAPIKey(os.Getenv("ELEVENLABS_API_KEY")),
	)

	req := media.SpeechRequest{
		Text:   "Hello from the GAI framework! This is a test of text-to-speech synthesis.",
		Voice:  "Rachel",
		Format: media.FormatMP3,
		Speed:  1.0,
	}

	stream, err := provider.Synthesize(context.Background(), req)
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	// Tally the size of every streamed chunk; a real caller would write
	// the chunks to a file or stream them to a client instead.
	total := 0
	for chunk := range stream.Chunks() {
		total += len(chunk)
	}

	fmt.Printf("Generated %d bytes of MP3 audio\n", total)
}

type ElevenLabsOption

type ElevenLabsOption func(*ElevenLabs)

ElevenLabsOption configures the ElevenLabs provider.

func WithElevenLabsAPIKey

func WithElevenLabsAPIKey(key string) ElevenLabsOption

WithElevenLabsAPIKey sets the API key.

func WithElevenLabsBaseURL

func WithElevenLabsBaseURL(url string) ElevenLabsOption

WithElevenLabsBaseURL sets a custom base URL.

func WithElevenLabsModel

func WithElevenLabsModel(model string) ElevenLabsOption

WithElevenLabsModel sets the default model.

func WithElevenLabsVoice

func WithElevenLabsVoice(voice string) ElevenLabsOption

WithElevenLabsVoice sets the default voice.

type ProviderConfig

type ProviderConfig struct {
	// API key for authentication.
	APIKey string

	// Base URL for the API.
	BaseURL string

	// Organization ID (if applicable).
	Organization string

	// Project ID (if applicable).
	Project string

	// Default voice to use.
	DefaultVoice string

	// Default model to use.
	DefaultModel string

	// Default audio format.
	DefaultFormat string

	// Request timeout.
	Timeout time.Duration

	// Maximum retries for failed requests.
	MaxRetries int

	// Custom HTTP headers.
	Headers map[string]string
}

ProviderConfig holds common configuration for audio providers.

type SpeakInput

type SpeakInput struct {
	// Text to speak (required).
	Text string `json:"text" jsonschema:"required,description=The text to convert to speech"`

	// Voice to use (optional, uses provider default if not specified).
	Voice string `json:"voice,omitempty" jsonschema:"description=Voice ID or name to use for synthesis"`

	// Format for the output (optional, defaults to mp3).
	Format string `json:"format,omitempty" jsonschema:"enum=mp3,enum=wav,enum=ogg,description=Audio format for the output"`

	// Speed of speech (optional, 0.5 to 2.0, default 1.0).
	Speed float32 `json:"speed,omitempty" jsonschema:"minimum=0.5,maximum=2.0,description=Speaking speed (0.5 to 2.0)"`

	// Save to file (optional, if true saves to temp file and returns path).
	SaveToFile bool `json:"save_to_file,omitempty" jsonschema:"description=Save audio to a temporary file"`

	// Return as data URL (optional, if true returns base64 data URL).
	ReturnDataURL bool `json:"return_data_url,omitempty" jsonschema:"description=Return audio as a base64 data URL"`
}

SpeakInput defines the input for the Speak tool.

type SpeakOutput

type SpeakOutput struct {
	// Success indicates whether the speech synthesis succeeded.
	Success bool `json:"success"`

	// FilePath is the path to the saved audio file (if SaveToFile was true).
	FilePath string `json:"file_path,omitempty"`

	// DataURL is the base64-encoded data URL (if ReturnDataURL was true).
	DataURL string `json:"data_url,omitempty"`

	// Format of the audio.
	Format string `json:"format"`

	// Duration in seconds (estimated).
	DurationSeconds float64 `json:"duration_seconds,omitempty"`

	// Size in bytes.
	SizeBytes int `json:"size_bytes"`

	// Error message if synthesis failed.
	Error string `json:"error,omitempty"`
}

SpeakOutput defines the output from the Speak tool.

type SpeakToolOption

type SpeakToolOption func(*speakToolConfig)

SpeakToolOption configures the Speak tool.

func WithSpeakToolCleanup

func WithSpeakToolCleanup(duration time.Duration) SpeakToolOption

WithSpeakToolCleanup sets automatic file cleanup duration.

func WithSpeakToolDefaultFormat

func WithSpeakToolDefaultFormat(format string) SpeakToolOption

WithSpeakToolDefaultFormat sets the default audio format.

func WithSpeakToolMaxTextLength

func WithSpeakToolMaxTextLength(length int) SpeakToolOption

WithSpeakToolMaxTextLength sets the maximum text length.

func WithSpeakToolTempDir

func WithSpeakToolTempDir(dir string) SpeakToolOption

WithSpeakToolTempDir sets the temporary directory for audio files.

type SpeakerSegment

type SpeakerSegment struct {
	Speaker int
	Start   time.Duration
	End     time.Duration
	Text    string
}

SpeakerSegment identifies a speaker's portion of the audio.

type SpeechProvider

type SpeechProvider interface {
	// Synthesize converts text to speech, returning a stream of audio chunks.
	Synthesize(ctx context.Context, req SpeechRequest) (SpeechStream, error)

	// ListVoices returns available voices for this provider.
	ListVoices(ctx context.Context) ([]Voice, error)
}

SpeechProvider synthesizes text into speech audio.

type SpeechRequest

type SpeechRequest struct {
	// Text to synthesize (required).
	Text string

	// Voice ID or name (provider-specific).
	Voice string

	// Model to use (provider-specific, e.g., "eleven_multilingual_v2").
	Model string

	// Output format (e.g., "mp3", "pcm", "opus").
	Format string

	// Speaking speed (0.5 to 2.0, 1.0 is normal).
	Speed float32

	// Voice stability (provider-specific, 0.0 to 1.0).
	Stability float32

	// Voice similarity boost (provider-specific, 0.0 to 1.0).
	SimilarityBoost float32

	// Additional provider-specific options.
	Options map[string]any
}

SpeechRequest configures text-to-speech synthesis.

type SpeechStream

type SpeechStream interface {
	// Chunks returns a channel of audio data chunks.
	Chunks() <-chan []byte

	// Format returns the audio format information.
	Format() AudioFormat

	// Close stops the stream and releases resources.
	Close() error

	// Error returns any error that occurred during streaming.
	Error() error
}

SpeechStream provides streaming audio output from TTS.

type TranscriptionAlternative

type TranscriptionAlternative struct {
	Text       string
	Confidence float32
}

TranscriptionAlternative represents an alternative transcription.

type TranscriptionEvent

type TranscriptionEvent struct {
	// Type of event.
	Type TranscriptionEventType

	// Transcribed text (for partial and final results).
	Text string

	// Whether this is a final result.
	IsFinal bool

	// Word timing (if available).
	Words []WordTiming

	// Error (for error events).
	Error error
}

TranscriptionEvent represents a real-time transcription update.

type TranscriptionEventType

type TranscriptionEventType int

TranscriptionEventType identifies the type of transcription event.

const (
	TranscriptionPartial TranscriptionEventType = iota
	TranscriptionFinal
	TranscriptionError
	TranscriptionEnd
)

type TranscriptionProvider

type TranscriptionProvider interface {
	// Transcribe converts audio to text.
	Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)

	// TranscribeStream processes streaming audio input.
	TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)
}

TranscriptionProvider converts speech audio to text.

type TranscriptionRequest

type TranscriptionRequest struct {
	// Audio input source.
	Audio core.BlobRef

	// Language code (e.g., "en", "es", "fr").
	Language string

	// Model to use (provider-specific).
	Model string

	// Enable punctuation restoration.
	Punctuate bool

	// Enable speaker diarization.
	Diarize bool

	// Enable profanity filtering.
	FilterProfanity bool

	// Custom vocabulary/keywords.
	Keywords []string

	// Maximum alternatives to return.
	MaxAlternatives int

	// Additional provider-specific options.
	Options map[string]any
}

TranscriptionRequest configures speech-to-text transcription.

type TranscriptionResult

type TranscriptionResult struct {
	// Primary transcription text.
	Text string

	// Alternative transcriptions with confidence scores.
	Alternatives []TranscriptionAlternative

	// Word-level timing information.
	Words []WordTiming

	// Detected language (if auto-detected).
	Language string

	// Overall confidence score (0.0 to 1.0).
	Confidence float32

	// Duration of the audio.
	Duration time.Duration

	// Speaker segments (if diarization enabled).
	Speakers []SpeakerSegment
}

TranscriptionResult contains the transcribed text and metadata.

type TranscriptionStream

type TranscriptionStream interface {
	// Events returns a channel of transcription events.
	Events() <-chan TranscriptionEvent

	// Close stops the stream and releases resources.
	Close() error
}

TranscriptionStream provides real-time transcription of streaming audio.

type Voice

type Voice struct {
	// Unique voice identifier.
	ID string

	// Human-readable voice name.
	Name string

	// Voice description.
	Description string

	// Language codes supported by this voice.
	Languages []string

	// Voice gender (if specified).
	Gender string

	// Voice age category (e.g., "young", "middle-aged", "old").
	Age string

	// Voice style/use case tags (e.g., "conversational", "narrative").
	Tags []string

	// Preview audio URL (if available).
	PreviewURL string

	// Whether this is a premium voice.
	Premium bool
}

Voice represents an available TTS voice.

type Whisper

type Whisper struct {
	// contains filtered or unexported fields
}

Whisper implements TranscriptionProvider for OpenAI Whisper API or compatible servers.

func NewWhisper

func NewWhisper(opts ...WhisperOption) *Whisper

NewWhisper creates a new Whisper STT provider.

func (*Whisper) Transcribe

func (w *Whisper) Transcribe(ctx context.Context, req TranscriptionRequest) (*TranscriptionResult, error)

Transcribe converts audio to text using Whisper.

Example

Example demonstrates transcribing audio with Whisper

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/recera/gai/core"
	"github.com/recera/gai/media"
)

func main() {
	// Create STT provider
	stt := media.NewWhisper(
		media.WithWhisperAPIKey(os.Getenv("OPENAI_API_KEY")),
	)

	// Transcribe audio from URL
	ctx := context.Background()
	result, err := stt.Transcribe(ctx, media.TranscriptionRequest{
		Audio: core.BlobRef{
			Kind: core.BlobURL,
			URL:  "https://example.com/sample-audio.wav",
			MIME: "audio/wav",
		},
		Language:  "en",
		Punctuate: true,
		// Custom vocabulary to boost recognition of these terms.
		Keywords: []string{"GAI", "framework", "artificial intelligence"},
	})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Transcription: %s\n", result.Text)
	fmt.Printf("Language: %s\n", result.Language)
	fmt.Printf("Duration: %v\n", result.Duration)

	// Show word timings if available
	if len(result.Words) > 0 {
		fmt.Println("\nFirst 5 words with timing:")
		for i, word := range result.Words {
			if i >= 5 {
				break
			}
			fmt.Printf("  %s: %v-%v\n", word.Word, word.Start, word.End)
		}
	}
}

func (*Whisper) TranscribeStream

func (w *Whisper) TranscribeStream(ctx context.Context, audio io.Reader) (TranscriptionStream, error)

TranscribeStream processes streaming audio input (not supported by standard Whisper).

type WhisperOption

type WhisperOption func(*Whisper)

WhisperOption configures the Whisper provider.

func WithWhisperAPIKey

func WithWhisperAPIKey(key string) WhisperOption

WithWhisperAPIKey sets the API key.

func WithWhisperBaseURL

func WithWhisperBaseURL(url string) WhisperOption

WithWhisperBaseURL sets a custom base URL (for self-hosted Whisper).

func WithWhisperModel

func WithWhisperModel(model string) WhisperOption

WithWhisperModel sets the default model.

func WithWhisperOrganization

func WithWhisperOrganization(org string) WhisperOption

WithWhisperOrganization sets the OpenAI organization ID.

type WordTiming

type WordTiming struct {
	Word       string
	Start      time.Duration
	End        time.Duration
	Confidence float32
	Speaker    int // Speaker ID if diarization enabled
}

WordTiming provides timing information for individual words.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL