polyphon

package

Version: v0.9.0 (latest) · Published: May 30, 2026 · License: Apache-2.0 · Imports: 17 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/znasllc-io/memql

Links

Open Source Insights

Documentation ¶

Overview ¶

Package polyphon implements the Polyphon multi-agent voice conversation orchestration system. It scores multiple SI agents, manages turn-taking, and decides which agent should respond to human utterances.

Polyphon (from Greek "polyphōnía" — many voices) enables up to 3 SI agents and 5 humans to participate in natural real-time voice conversations.

Index ¶

Constants
Variables
func EstimateTokens(text string) int
func ExpandDomains(domains []string) []string
func LookupContextWindow(model string) int
func SplitSentences(text string) []string
type ASRConfig
- func DefaultASRConfig() ASRConfig
type ASRProvider
type ASRResult
type ASRResultKind
type ASRStream
type AgentCandidate
type AgentRoomConfig
type AgentScore
type CompactedContext
type CompactionLevel
- func DetermineCompactionLevel(estimatedTokens, availableBudget int) CompactionLevel
type Config
- func ConfigFromEnv() Config
- func (c Config) LiveKitConfigured() bool
type ContextBuilder
- func NewContextBuilder() *ContextBuilder
- func (cb *ContextBuilder) BuildForAgent(session *PolyphonSession, agentId string) *CompactedContext
- func (cb *ContextBuilder) BuildForAgentSemantic(session *PolyphonSession, agentId string, utteranceEmbedding []float32, ...) *CompactedContext
- func (cb *ContextBuilder) BuildForAgentWithBudget(session *PolyphonSession, agentId string, budget TokenBudget) *CompactedContext
- func (cb *ContextBuilder) EntriesForCompaction(session *PolyphonSession) []TranscriptEntry
- func (cb *ContextBuilder) TriggerCompaction(ctx context.Context, session *PolyphonSession, ...)
- func (cb *ContextBuilder) UpdateSummary(session *PolyphonSession, summary string, upToIndex int)
type ConversationPhase
type ConversationStateMachine
- func NewConversationStateMachine() *ConversationStateMachine
- func (sm *ConversationStateMachine) LastTransition() time.Time
- func (sm *ConversationStateMachine) Phase() ConversationPhase
- func (sm *ConversationStateMachine) Transition(intent IntentType) ConversationPhase
type DefaultHeartbeatEvaluator
- func NewDefaultHeartbeatEvaluator() *DefaultHeartbeatEvaluator
- func (e *DefaultHeartbeatEvaluator) Evaluate(session *PolyphonSession, candidates []AgentCandidate) *HeartbeatAction
type ExternalPredictiveAnalyzer
- func NewExternalPredictiveAnalyzer(endpoint string, fallback PredictiveAnalyzer) *ExternalPredictiveAnalyzer
- func (a *ExternalPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
type HeartbeatAction
type HeartbeatEvaluator
type IntentResult
- func ClassifyIntent(text string, mentions []Mention, prevEntry *TranscriptEntry) *IntentResult
type IntentType
type LocalRoomProvider
- func NewLocalRoomProvider(cfg Config) *LocalRoomProvider
- func (p *LocalRoomProvider) CreateRoom(_ context.Context, _ string, _ []AgentRoomConfig) (*RoomInfo, error)
- func (p *LocalRoomProvider) DestroyRoom(_ context.Context, _ string) error
- func (p *LocalRoomProvider) GenerateToken(_ context.Context, spaceId, participantId, displayName string) (*RoomToken, error)
- func (p *LocalRoomProvider) GetRoomInfo(_ context.Context, _ string) (*RoomInfo, error)
type Mention
- func ParseMentions(text string, participants []ParticipantRef) ([]Mention, string)
type MentionRole
type ModelPredictiveAnalyzer
- func NewModelPredictiveAnalyzer(...) *ModelPredictiveAnalyzer
- func (a *ModelPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
type NoOpPredictiveAnalyzer
- func (n *NoOpPredictiveAnalyzer) Analyze(_ context.Context, _ *PolyphonSession, _ []AgentCandidate) (*PredictiveState, error)
type ParticipantRef
type PhasePatternLibrary
- func NewPhasePatternLibrary() *PhasePatternLibrary
- func (lib *PhasePatternLibrary) DetectPhase(utteranceEmbedding []float32) (ConversationPhase, float64)
- func (lib *PhasePatternLibrary) InitEmbeddings(ctx context.Context, ...) error
type PolyphonSession
- func NewSession(spaceId string) *PolyphonSession
- func (s *PolyphonSession) AddTranscript(entry TranscriptEntry)
- func (s *PolyphonSession) AgentTurnsSinceHuman() int
- func (s *PolyphonSession) GetAddressedAgent() (string, time.Time)
- func (s *PolyphonSession) GetHandoffChainDepth() int
- func (s *PolyphonSession) GetPrediction() *PredictiveState
- func (s *PolyphonSession) LastAgentSpokeAt(agentId string) time.Time
- func (s *PolyphonSession) RecentTranscript(n int) []TranscriptEntry
- func (s *PolyphonSession) RecordHandoffOutcome(handoff bool)
- func (s *PolyphonSession) SetAddressedAgent(agentId string)
- func (s *PolyphonSession) SetPrediction(state *PredictiveState)
type PredictiveAnalyzer
type PredictiveState
type RoomInfo
type RoomParticipant
type RoomProvider
type RoomToken
type ScoreDecision
- func (d *ScoreDecision) HasWinner() bool
type ScoreEngine
- func NewScoreEngine(logger *slog.Logger) *ScoreEngine
- func NewScoreEngineWithConfig(weights ScoringWeights, policyConfig TurnPolicyConfig, logger *slog.Logger) *ScoreEngine
- func (c *ScoreEngine) Policy() *TurnPolicy
- func (c *ScoreEngine) ProcessUtterance(_ context.Context, utterance Utterance, candidates []AgentCandidate) *ScoreDecision
- func (c *ScoreEngine) RecordAgentResponse(spaceId string, agentId, agentName, responseText string, ...) (bool, *AgentScore)
- func (c *ScoreEngine) Scorer() *Scorer
- func (c *ScoreEngine) Sessions() *SessionManager
type Scorer
- func NewScorer(weights ScoringWeights, policy TurnPolicyConfig) *Scorer
- func (s *Scorer) ScoreAll(candidates []AgentCandidate, utterance Utterance, session *PolyphonSession) []AgentScore
type ScoringFactor
type ScoringWeights
- func DefaultScoringWeights() ScoringWeights
type SessionAgent
type SessionManager
- func NewSessionManager(logger *slog.Logger) *SessionManager
- func (m *SessionManager) ActiveSessions() int
- func (m *SessionManager) AddAgent(spaceId string, agentId, participantId, name string) error
- func (m *SessionManager) AddHuman(spaceId string, participantId, displayName string) error
- func (m *SessionManager) Get(spaceId string) *PolyphonSession
- func (m *SessionManager) GetOrCreate(spaceId string) *PolyphonSession
- func (m *SessionManager) HasHeartbeat(spaceId string) bool
- func (m *SessionManager) HasHumans(spaceId string) bool
- func (m *SessionManager) HasPrediction(spaceId string) bool
- func (m *SessionManager) RecordUtterance(spaceId string, entry TranscriptEntry)
- func (m *SessionManager) Remove(spaceId string) *PolyphonSession
- func (m *SessionManager) RemoveAgent(spaceId, agentId string)
- func (m *SessionManager) RemoveHuman(spaceId, participantId string)
- func (m *SessionManager) StartHeartbeat(spaceId string, interval time.Duration, evaluator HeartbeatEvaluator, ...)
- func (m *SessionManager) StartPrediction(spaceId string, interval time.Duration, analyzer PredictiveAnalyzer, ...)
- func (m *SessionManager) StopHeartbeat(spaceId string)
- func (m *SessionManager) StopPrediction(spaceId string)
type SessionParticipant
type TTSChunk
type TTSConfig
- func DefaultTTSConfig(text, voiceModel string) TTSConfig
type TTSProvider
type TTSStream
type TokenBudget
- func CalculateTokenBudget(contextWindow, maxOutput, systemPromptEst int) TokenBudget
type TranscriptEntry
type TurnPolicy
- func NewTurnPolicy(cfg TurnPolicyConfig) *TurnPolicy
- func (p *TurnPolicy) CanAgentRespond(agentId string, score float64, session *PolyphonSession) bool
- func (p *TurnPolicy) EvaluateScores(scores []AgentScore, session *PolyphonSession) []AgentScore
- func (p *TurnPolicy) ResponseDelay() time.Duration
- func (p *TurnPolicy) ShouldContinue(scores []AgentScore, session *PolyphonSession) (bool, *AgentScore)
type TurnPolicyConfig
- func DefaultTurnPolicy() TurnPolicyConfig
type Utterance
type VoiceInfo
type VoicePlatform

Constants ¶

View Source

// MaxAgentsPerSpace is the maximum number of SI agents in a single Polyphon
// session.
const MaxAgentsPerSpace = 3

MaxAgentsPerSpace is the maximum number of SI agents in a Polyphon session.

View Source

// MaxHumansPerSpace is the maximum number of human participants in a single
// Polyphon session.
const MaxHumansPerSpace = 5

MaxHumansPerSpace is the maximum number of human participants in a Polyphon session.

Variables ¶

View Source

// DefaultContextWindows maps model names to context window sizes and serves
// as the fallback when no contextWindow is given in the provider config.
// Values are presumably token counts — TODO confirm the unit.
//
// NOTE(review): the package docs ask to keep this table in sync with
// providers/v1/openai/*.memql, but the table also carries claude-* keys;
// confirm which provider file is the source of truth for those entries.
var DefaultContextWindows = map[string]int{

	"gpt-5.4":             128000,
	"gpt-5.4-pro":         256000,
	"gpt-5.4-mini":        128000,
	"gpt-5.4-nano":        32000,
	"gpt-5.3-chat-latest": 128000,

	"o4-mini": 200000,

	"gpt-5.3-codex":     256000,
	"gpt-5.1-codex-max": 400000,

	"gpt-5-search-api": 128000,

	"o4-mini-deep-research": 200000,
	"o3-deep-research":      200000,

	// claude-* entries all share the same 200000 window.
	"claude-opus-4-6":   200000,
	"claude-sonnet-4-6": 200000,
	"claude-haiku-4-5":  200000,
}

DefaultContextWindows maps known model prefixes to their context window sizes. Used as fallback when contextWindow is not specified in the provider config. Keep this in sync with providers/v1/openai/*.memql -- the provider file's contextWindow param is the source of truth at runtime; this table is consulted only when a caller asks for the budget without a loaded provider config.

Functions ¶

func EstimateTokens ¶

func EstimateTokens(text string) int

EstimateTokens provides a fast token estimate (characters / 4). The estimate is within ~20% of the actual token count for English text.

func ExpandDomains ¶

func ExpandDomains(domains []string) []string

ExpandDomains returns the input domains plus common synonyms and aliases. Duplicates are removed. Original terms are always preserved.

func LookupContextWindow ¶

func LookupContextWindow(model string) int

LookupContextWindow returns the context window for a model name. It first checks the lookup table for an exact match, then for a prefix match, and falls back to 32000 (a conservative default) for unknown models.

func SplitSentences ¶

func SplitSentences(text string) []string

SplitSentences splits text into sentences at natural boundaries.

Rules:

Split on . ! ? followed by whitespace or end-of-string
Don't split abbreviations (Mr., Dr., e.g., i.e., etc.)
Don't split decimal numbers (3.14, $5.99)
Don't split inside quoted text
Merge very short sentences (<20 chars) with the next sentence
Flush remaining text as final sentence (even without punctuation)

Types ¶

type ASRConfig ¶

// ASRConfig configures a streaming ASR session.
//
// NOTE(review): ASRStream.SendAudio documents its input as PCM16, 16kHz,
// mono, while SampleRate here is configurable — confirm how a non-default
// SampleRate interacts with that requirement.
type ASRConfig struct {
	// SampleRate in Hz (default: 16000).
	SampleRate int `json:"sampleRate"`

	// Language is a BCP-47 language code (e.g., "en-US", "es-MX").
	Language string `json:"language"`

	// EnableDiarization enables speaker identification in the transcript.
	EnableDiarization bool `json:"enableDiarization"`

	// MaxSpeakers is the maximum number of speakers for diarization.
	MaxSpeakers int `json:"maxSpeakers"`
}

ASRConfig configures a streaming ASR session.

func DefaultASRConfig ¶

func DefaultASRConfig() ASRConfig

DefaultASRConfig returns sensible defaults for ASR configuration.

type ASRProvider ¶

// ASRProvider abstracts speech-to-text transcription behind a single
// stream-opening method.
type ASRProvider interface {
	// StartStream begins a new streaming transcription session.
	// Audio chunks are sent via SendAudio, and transcription results
	// arrive on the Results channel.
	// Both SendAudio and Results are methods of the returned ASRStream.
	StartStream(ctx context.Context, config ASRConfig) (ASRStream, error)
}

ASRProvider abstracts speech-to-text transcription. The implementation is OpenAI Realtime in Stage 1; Stage 2 of the Deepgram migration adds Deepgram Nova-3 as the new default.

type ASRResult ¶

// ASRResult is a transcription result from the ASR provider.
//
// Text carries the transcript and IsFinal its finality: interim when false,
// committed end-of-utterance when true. Confidence is presumably the
// provider's confidence in Text; its range is not shown here — TODO confirm.
type ASRResult struct {
	Text       string  `json:"text"`
	IsFinal    bool    `json:"isFinal"`
	Confidence float64 `json:"confidence"`
	SpeakerId  string  `json:"speakerId,omitempty"` // Only if diarization enabled
	// Kind discriminates the turn-structure role of this result.
	// The zero value (ASRKindTranscript) preserves the historical
	// Text/IsFinal behavior; ASRKindSpeechStarted carries an onset
	// signal with no text. Additive for the turn-taking machine (#455);
	// transcript-only consumers ignore it.
	Kind ASRResultKind `json:"kind,omitempty"`
}

ASRResult is a transcription result from the ASR provider.

type ASRResultKind ¶

// ASRResultKind discriminates the turn-structure role of an ASRResult.
// Its zero value is ASRKindTranscript, so results that never set Kind keep
// the historical Text/IsFinal meaning.
type ASRResultKind int

ASRResultKind discriminates the turn-structure role of an ASRResult. It is an additive, backward-compatible enrichment of the stream contract (see docs/voice/452-turntaking-orchestration-go.md, step 1): the zero value (ASRKindTranscript) preserves the historical interim/final-via-IsFinal behavior, so existing consumers that only read Text/IsFinal are unaffected. The Go turn-taking machine (#455) additionally reads Kind to drive barge-in onset off ASRKindSpeechStarted.

// ASRResultKind values. They are declared with iota, so ASRKindTranscript is
// 0 (the zero value) and ASRKindSpeechStarted is 1.
const (
	// ASRKindTranscript is the default kind: a transcript update whose
	// finality is carried by IsFinal (interim when false, committed
	// end-of-utterance when true). This is the only kind pre-#455
	// providers emitted, so it is the zero value for backward
	// compatibility.
	ASRKindTranscript ASRResultKind = iota
	// ASRKindSpeechStarted marks a voice-activity onset (Deepgram's
	// SpeechStarted VAD event). It carries no transcript text; it is the
	// signal the turn-taking machine uses to enter human-turn and to
	// raise a barge-in candidate while the assistant has the floor.
	ASRKindSpeechStarted
)

type ASRStream ¶

// ASRStream represents an active streaming transcription session: audio goes
// in through SendAudio and transcription results come out of Results.
type ASRStream interface {
	// SendAudio sends a chunk of audio data for transcription.
	// Audio must be PCM16, 16kHz, mono.
	SendAudio(audio []byte) error

	// Results returns a channel of transcription results.
	// Intermediate (non-final) results may arrive before the final transcript.
	// The channel is receive-only for callers.
	Results() <-chan ASRResult

	// Close terminates the stream and releases resources.
	Close() error
}

ASRStream represents an active streaming transcription session.

type AgentCandidate ¶

// AgentCandidate represents an SI agent being evaluated by the score engine.
//
// ID, ParticipantId and Name are always serialized; every other JSON field is
// omitted when empty. ProviderConfig and Capabilities are free-form maps,
// spelled map[string]any to match the rest of the package; that type is
// identical to map[string]interface{}, so existing callers and JSON payloads
// are unaffected. ProfileEmbedding is excluded from JSON (tag "-").
type AgentCandidate struct {
	ID                   string         `json:"id"`
	ParticipantId        string         `json:"participantId"`
	Name                 string         `json:"name"`
	Description          string         `json:"description,omitempty"`
	Personality          string         `json:"personality,omitempty"`
	Role                 string         `json:"role,omitempty"` // e.g. "assistant", "accounting-finance"
	Domains              []string       `json:"domains,omitempty"`
	Keywords             []string       `json:"keywords,omitempty"`
	SpeakWhen            string         `json:"speakWhen,omitempty"`            // "asked", "relevant", "always"
	InterruptStyle       string         `json:"interruptStyle,omitempty"`       // "polite", "assertive", "passive"
	ContinuationBehavior string         `json:"continuationBehavior,omitempty"` // "add_context", "disagree", "summarize"
	ProviderConfig       map[string]any `json:"providerConfig,omitempty"`
	Capabilities         map[string]any `json:"capabilities,omitempty"`
	ProfileEmbedding     []float32      `json:"-"` // Pre-loaded from node_vectors for vector domain scoring
}

AgentCandidate represents an SI agent being evaluated by the score engine.

type AgentRoomConfig ¶

// AgentRoomConfig describes an SI agent to be added to a room.
// All three fields are plain strings serialized under the JSON keys agentId,
// name and voiceModel.
type AgentRoomConfig struct {
	AgentId    string `json:"agentId"`
	Name       string `json:"name"`
	VoiceModel string `json:"voiceModel"`
}

AgentRoomConfig describes an SI agent to be added to a room.

type AgentScore ¶

// AgentScore is the scoring result for a single agent candidate.
//
// TotalScore is a weighted sum on a 0–100 scale. Factors presumably lists
// the individual contributions behind that sum — TODO confirm against
// ScoringFactor. ToolsNeeded is the only field omitted from JSON when false.
type AgentScore struct {
	AgentId       string          `json:"agentId"`
	AgentName     string          `json:"agentName"`
	TotalScore    float64         `json:"totalScore"` // Weighted sum (0–100)
	Factors       []ScoringFactor `json:"factors"`
	ShouldRespond bool            `json:"shouldRespond"`
	Reason        string          `json:"reason"`
	ToolsNeeded   bool            `json:"toolsNeeded,omitempty"` // SI router signals the agent needs tools for this utterance
}

AgentScore is the scoring result for a single agent candidate.

type CompactedContext ¶

// CompactedContext is an agent-scoped view of the conversation history,
// split into three zones instead of the full message list:
//
//   - Zone 1 (Summary): compressed old history.
//   - Zone 2 (RecentMessages): the last N messages verbatim.
//   - Zone 3 (AgentMessages): this agent's own exchanges.
//
// TotalTokensEst is presumably the estimated token cost of all three zones
// combined — TODO confirm. The struct has no JSON tags.
type CompactedContext struct {
	Summary        string            // Zone 1
	RecentMessages []TranscriptEntry // Zone 2
	AgentMessages  []TranscriptEntry // Zone 3
	TotalTokensEst int
}

CompactedContext is an agent-scoped view of the conversation history. Instead of passing all messages to the AI, it provides 3 zones: Zone 1 (Summary) holds compressed old history; Zone 2 (Recent) holds the last N messages verbatim; Zone 3 (Agent-Relevant) holds this agent's own exchanges.

type CompactionLevel ¶

// CompactionLevel determines how aggressively to compact based on budget
// usage. It is an int-backed enumeration.
type CompactionLevel int

CompactionLevel determines how aggressively to compact based on budget usage.

// CompactionLevel values, declared with iota in order of increasing
// aggressiveness (CompactionNormal = 0 through CompactionReset = 3). The
// percentages refer to how much of the budget is in use.
const (
	CompactionNormal     CompactionLevel = iota // Budget < 70%: full context
	CompactionAggressive                        // 70-90%: reduced context
	CompactionEmergency                         // 90-100%: minimal context
	CompactionReset                             // Overflow: start fresh
)

func DetermineCompactionLevel ¶

func DetermineCompactionLevel(estimatedTokens, availableBudget int) CompactionLevel

DetermineCompactionLevel returns the appropriate compaction level based on estimated token usage vs available budget.

type Config ¶

// Config holds the Polyphon LiveKit transport configuration, plus optional
// overrides for scoring weights and turn policy. The struct has no JSON tags.
//
// NOTE(review): the package docs describe this config as controlling only the
// LiveKit transport layer, yet Weights and Policy are score-engine settings —
// confirm the intended scope.
type Config struct {
	// LiveKitURL is the LiveKit server WebSocket URL (used for server-side connections).
	// Set via POLYPHON_LIVEKIT_URL environment variable.
	LiveKitURL string

	// LiveKitPublicURL is the browser-reachable LiveKit WebSocket URL.
	// In Docker, the internal hostname (e.g. ws://livekit:7880) is not reachable
	// from the browser, so this provides the external URL (e.g. ws://localhost:7880).
	// Falls back to LiveKitURL if not set.
	// Set via POLYPHON_LIVEKIT_PUBLIC_URL environment variable.
	LiveKitPublicURL string

	// LiveKitAPIKey is the LiveKit API key.
	// Set via POLYPHON_LIVEKIT_API_KEY environment variable.
	LiveKitAPIKey string

	// LiveKitAPISecret is the LiveKit API secret.
	// Set via POLYPHON_LIVEKIT_API_SECRET environment variable.
	LiveKitAPISecret string

	// Scoring weights override (optional).
	// A pointer, so nil is a representable "no override" value.
	Weights *ScoringWeights

	// Turn policy override (optional).
	// A pointer, so nil is a representable "no override" value.
	Policy *TurnPolicyConfig
}

Config holds the Polyphon LiveKit transport configuration. The score engine is always active. This config only controls the LiveKit transport layer for multi-agent voice rooms.

func ConfigFromEnv ¶

func ConfigFromEnv() Config

ConfigFromEnv loads Polyphon configuration from environment variables.

func (Config) LiveKitConfigured ¶

func (c Config) LiveKitConfigured() bool

LiveKitConfigured returns true when all LiveKit credentials are present. The transport layer (room tokens, voice rooms) is only available when the LiveKit URL (which must start with ws:// or wss://), the API key, and the API secret are all set.

type ContextBuilder ¶

type ContextBuilder struct {
	// contains filtered or unexported fields
}

ContextBuilder constructs agent-scoped context views from session transcripts.

func NewContextBuilder ¶

func NewContextBuilder() *ContextBuilder

NewContextBuilder creates a context builder with defaults.

func (*ContextBuilder) BuildForAgent ¶

func (cb *ContextBuilder) BuildForAgent(session *PolyphonSession, agentId string) *CompactedContext

BuildForAgent constructs a compacted context for a specific agent.

func (*ContextBuilder) BuildForAgentSemantic ¶

func (cb *ContextBuilder) BuildForAgentSemantic(
	session *PolyphonSession,
	agentId string,
	utteranceEmbedding []float32,
	agentProfileEmbedding []float32,
	findRelevant func(combinedEmbedding []float32, limit int) ([]TranscriptEntry, error),
) *CompactedContext

BuildForAgentSemantic constructs a context where Zone 3 uses semantic similarity instead of speaker-ID filtering. Falls back to BuildForAgent when embeddings are unavailable.

The findRelevant function queries node_vectors for utterances similar to the combined agent profile + current utterance embedding.

func (*ContextBuilder) BuildForAgentWithBudget ¶

func (cb *ContextBuilder) BuildForAgentWithBudget(session *PolyphonSession, agentId string, budget TokenBudget) *CompactedContext

BuildForAgentWithBudget constructs a compacted context that fits within the given token budget. It uses a 4-level compaction escalation (see CompactionLevel).

func (*ContextBuilder) EntriesForCompaction ¶

func (cb *ContextBuilder) EntriesForCompaction(session *PolyphonSession) []TranscriptEntry

EntriesForCompaction returns transcript entries that have not been summarized yet and are outside the recent window.

func (*ContextBuilder) TriggerCompaction ¶

func (cb *ContextBuilder) TriggerCompaction(
	ctx context.Context,
	session *PolyphonSession,
	invokeAI func(ctx context.Context, templateId string, data map[string]any) (any, error),
)

TriggerCompaction runs AI summarization on old transcript entries. It is debounced: the call is skipped if compaction ran within the last 30 seconds. It is designed to be called from a fire-and-forget goroutine so that the caller is not blocked.

func (*ContextBuilder) UpdateSummary ¶

func (cb *ContextBuilder) UpdateSummary(session *PolyphonSession, summary string, upToIndex int)

UpdateSummary stores a new compacted summary in the session.

type ConversationPhase ¶

// ConversationPhase represents the current phase of a group conversation.
// It is a string-backed enumeration.
type ConversationPhase string

ConversationPhase represents the current phase of a group conversation.

// ConversationPhase values. PhaseGreeting is the phase a new
// ConversationStateMachine starts in.
const (
	PhaseGreeting    ConversationPhase = "greeting"
	PhaseExploration ConversationPhase = "exploration"
	PhaseTask        ConversationPhase = "task"
	PhaseFollowUp    ConversationPhase = "follow_up"
	PhaseWindingDown ConversationPhase = "winding_down"
)

type ConversationStateMachine ¶

type ConversationStateMachine struct {
	// contains filtered or unexported fields
}

ConversationStateMachine tracks the conversation phase and handles transitions.

func NewConversationStateMachine ¶

func NewConversationStateMachine() *ConversationStateMachine

NewConversationStateMachine creates a state machine starting in the greeting phase.

func (*ConversationStateMachine) LastTransition ¶

func (sm *ConversationStateMachine) LastTransition() time.Time

LastTransition returns when the last phase change occurred.

func (*ConversationStateMachine) Phase ¶

func (sm *ConversationStateMachine) Phase() ConversationPhase

Phase returns the current conversation phase.

func (*ConversationStateMachine) Transition ¶

func (sm *ConversationStateMachine) Transition(intent IntentType) ConversationPhase

Transition updates the conversation phase based on the detected intent. Returns the new phase after the transition.

type DefaultHeartbeatEvaluator ¶

// DefaultHeartbeatEvaluator implements HeartbeatEvaluator with rule-based
// proactive behavior. Both thresholds are exported fields; the documented
// defaults are presumably applied by NewDefaultHeartbeatEvaluator — TODO
// confirm what a zero-value struct does.
type DefaultHeartbeatEvaluator struct {
	// SilenceThreshold is how long silence must last before re-engagement.
	// Default: 3 minutes.
	SilenceThreshold time.Duration
	// ClarifyingQuestionGrace is how long to wait after an agent asks a
	// question before considering re-engagement (the human may be typing).
	// Default: 30 seconds.
	ClarifyingQuestionGrace time.Duration
}

DefaultHeartbeatEvaluator implements HeartbeatEvaluator with rule-based proactive behavior. It checks for silence, pending events, and predictive state to decide whether the score engine should take action.

func NewDefaultHeartbeatEvaluator ¶

func NewDefaultHeartbeatEvaluator() *DefaultHeartbeatEvaluator

NewDefaultHeartbeatEvaluator creates a heartbeat evaluator with sensible defaults.

func (*DefaultHeartbeatEvaluator) Evaluate ¶

func (e *DefaultHeartbeatEvaluator) Evaluate(session *PolyphonSession, candidates []AgentCandidate) *HeartbeatAction

Evaluate checks the session state and returns a HeartbeatAction. Returns nil if no action should be taken (idle).

type ExternalPredictiveAnalyzer ¶

type ExternalPredictiveAnalyzer struct {
	// contains filtered or unexported fields
}

ExternalPredictiveAnalyzer calls an external HTTP endpoint for prediction. Activated when POLYPHON_PREDICTION_ENGINE_URL is set.

func NewExternalPredictiveAnalyzer ¶

func NewExternalPredictiveAnalyzer(endpoint string, fallback PredictiveAnalyzer) *ExternalPredictiveAnalyzer

NewExternalPredictiveAnalyzer creates an analyzer that calls an external service.

func (*ExternalPredictiveAnalyzer) Analyze ¶

func (a *ExternalPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)

type HeartbeatAction ¶

// HeartbeatAction represents a proactive action the score engine can take
// during a heartbeat tick. Type is a free-form string rather than a typed
// enum; the values listed on the field are the ones documented here. Payload
// is the only field omitted from JSON when empty.
type HeartbeatAction struct {
	Type      string         `json:"type"`    // "idle", "re-engage", "notify", "proactive"
	AgentId   string         `json:"agentId"` // Which agent should act
	AgentName string         `json:"agentName"`
	Reason    string         `json:"reason"`            // Why this action was chosen
	Payload   map[string]any `json:"payload,omitempty"` // Action-specific data
}

HeartbeatAction represents a proactive action the score engine can take during a heartbeat tick. The heartbeat runs continuously per active space, evaluating whether the score engine should intervene.

type HeartbeatEvaluator ¶

// HeartbeatEvaluator evaluates whether the score engine should take proactive
// action during a heartbeat tick. It is a single-method interface so that
// strategies can be swapped.
type HeartbeatEvaluator interface {
	// Evaluate inspects the session and the agent candidates and returns the
	// action to take. DefaultHeartbeatEvaluator returns nil for "no action";
	// whether other implementations must do the same is not specified here.
	Evaluate(session *PolyphonSession, candidates []AgentCandidate) *HeartbeatAction
}

HeartbeatEvaluator evaluates whether the score engine should take proactive action during a heartbeat tick. Implementations can be swapped for different strategies (e.g., rule-based, SI-powered, hybrid).

type IntentResult ¶

// IntentResult holds the classification output: a primary intent, an optional
// secondary intent (omitted from JSON when empty), and a confidence value
// whose range is not shown here — TODO confirm.
type IntentResult struct {
	Primary    IntentType `json:"primary"`
	Secondary  IntentType `json:"secondary,omitempty"`
	Confidence float64    `json:"confidence"`
}

IntentResult holds the classification output.

func ClassifyIntent ¶

func ClassifyIntent(text string, mentions []Mention, prevEntry *TranscriptEntry) *IntentResult

ClassifyIntent determines the intent of an utterance using rule-based heuristics. It runs in under 10 ms and makes no model call; it can be upgraded to a model-based classifier later.

type IntentType ¶

// IntentType classifies the purpose of a human utterance.
// It is a string-backed enumeration.
type IntentType string

IntentType classifies the purpose of a human utterance.

// IntentType values. Multi-word values use snake_case string forms
// (e.g. "follow_up", "domain_question").
const (
	IntentGreeting           IntentType = "greeting"
	IntentFollowUp           IntentType = "follow_up"
	IntentDomainQuestion     IntentType = "domain_question"
	IntentCapabilityQuestion IntentType = "capability_question"
	IntentTaskRequest        IntentType = "task_request"
	IntentDirectAddress      IntentType = "direct_address"
	IntentAffirmation        IntentType = "affirmation"
	IntentFarewell           IntentType = "farewell"
)

type LocalRoomProvider ¶

type LocalRoomProvider struct {
	// contains filtered or unexported fields
}

LocalRoomProvider generates LiveKit JWT tokens directly in the memQL process. This allows browser participants to join LiveKit rooms without the Bridge Agent running. Room/agent management operations are not supported -- they require the Bridge Agent.

func NewLocalRoomProvider ¶

func NewLocalRoomProvider(cfg Config) *LocalRoomProvider

NewLocalRoomProvider creates a provider that mints LiveKit access tokens using the credentials in cfg. The caller should verify cfg.LiveKitConfigured() before constructing this provider.

func (*LocalRoomProvider) CreateRoom ¶

func (p *LocalRoomProvider) CreateRoom(_ context.Context, _ string, _ []AgentRoomConfig) (*RoomInfo, error)

CreateRoom is not supported without the Bridge Agent.

func (*LocalRoomProvider) DestroyRoom ¶

func (p *LocalRoomProvider) DestroyRoom(_ context.Context, _ string) error

DestroyRoom is not supported without the Bridge Agent.

func (*LocalRoomProvider) GenerateToken ¶

func (p *LocalRoomProvider) GenerateToken(_ context.Context, spaceId, participantId, displayName string) (*RoomToken, error)

GenerateToken creates a LiveKit JWT for a participant to join a room. Room name follows the convention polyphon-{spaceId}.

func (*LocalRoomProvider) GetRoomInfo ¶

func (p *LocalRoomProvider) GetRoomInfo(_ context.Context, _ string) (*RoomInfo, error)

GetRoomInfo is not supported without the Bridge Agent.

type Mention ¶

// Mention represents a detected @ mention in an utterance.
//
// ParticipantType presumably mirrors ParticipantRef.ParticipantType ("agent"
// or "human") — TODO confirm. Position is a free-form string documented as
// "start" or "mid"; Role is one of the MentionRole values.
type Mention struct {
	ParticipantId   string      `json:"participantId"`
	Name            string      `json:"name"`
	ParticipantType string      `json:"participantType"`
	Position        string      `json:"position"` // "start" or "mid"
	Role            MentionRole `json:"role"`
}

Mention represents a detected @ mention in an utterance.

func ParseMentions ¶

func ParseMentions(text string, participants []ParticipantRef) ([]Mention, string)

ParseMentions scans utterance text for @{name} tokens and matches them against the participant roster. Returns detected mentions and cleaned text with @ prefixes removed for downstream scoring. Mentions carry ParticipantType so the router can distinguish "addressed an agent" (that agent responds) from "addressed a human" (AI stays silent).

type MentionRole ¶

// MentionRole indicates how a participant was mentioned in an utterance.
// It is a string-backed enumeration.
type MentionRole string

MentionRole indicates how a participant was mentioned in an utterance.

// MentionRole values.
// NOTE(review): presumably "addressee" marks the participant being spoken to
// and "reference" one merely talked about — confirm against ParseMentions.
const (
	MentionRoleAddressee MentionRole = "addressee"
	MentionRoleReference MentionRole = "reference"
)

type ModelPredictiveAnalyzer ¶

type ModelPredictiveAnalyzer struct {
	// contains filtered or unexported fields
}

ModelPredictiveAnalyzer uses a fast SI model to predict conversation trajectory. Runs asynchronously via the prediction goroutine (every 30s per active space).

func NewModelPredictiveAnalyzer ¶

func NewModelPredictiveAnalyzer(
	invokeSI func(ctx context.Context, templateId string, data map[string]any) (any, error),
	embedFunc func(ctx context.Context, text string) ([]float32, error),
) *ModelPredictiveAnalyzer

NewModelPredictiveAnalyzer creates an analyzer that uses SI for prediction. The embedFunc parameter is optional (pass nil to disable vector-based phase detection).

func (*ModelPredictiveAnalyzer) Analyze ¶

func (a *ModelPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)

type NoOpPredictiveAnalyzer ¶

// NoOpPredictiveAnalyzer is a placeholder analyzer that returns no
// prediction. It is an empty struct, so it carries no state.
type NoOpPredictiveAnalyzer struct{}

NoOpPredictiveAnalyzer is a placeholder that returns no prediction. Used when no SI engine is available (tests, standalone mode).

func (*NoOpPredictiveAnalyzer) Analyze ¶

func (n *NoOpPredictiveAnalyzer) Analyze(_ context.Context, _ *PolyphonSession, _ []AgentCandidate) (*PredictiveState, error)

type ParticipantRef ¶

// ParticipantRef is a minimal mention-lookup view of a participant; both
// agents and humans fit into it. The struct has no JSON tags.
type ParticipantRef struct {
	ID              string
	Name            string
	ParticipantType string // "agent" or "human"
}

ParticipantRef is a minimal mention-lookup view of a participant. Both agents and humans fit into it so ParseMentions can detect @-mentions that target either. The handler assembles a combined slice from agent candidates plus the human roster before calling.

type PhasePatternLibrary ¶

type PhasePatternLibrary struct {
	// contains filtered or unexported fields
}

PhasePatternLibrary holds pre-embedded conversation phase patterns for fast detection.

func NewPhasePatternLibrary ¶

func NewPhasePatternLibrary() *PhasePatternLibrary

NewPhasePatternLibrary creates a pattern library. Call InitEmbeddings() after creation to pre-compute embeddings for all patterns.

func (*PhasePatternLibrary) DetectPhase ¶

func (lib *PhasePatternLibrary) DetectPhase(utteranceEmbedding []float32) (ConversationPhase, float64)

DetectPhase compares an utterance embedding against pattern clusters and returns the best-matching phase with a confidence score.

func (*PhasePatternLibrary) InitEmbeddings ¶

func (lib *PhasePatternLibrary) InitEmbeddings(ctx context.Context, embedFunc func(ctx context.Context, text string) ([]float32, error)) error

InitEmbeddings pre-computes embeddings for all patterns using the given embed function. Call this once at startup.

type PolyphonSession ¶

type PolyphonSession struct {
	SpaceId               string                    `json:"spaceId"`
	Platform              VoicePlatform             `json:"platform"`
	Humans                []SessionParticipant      `json:"humans"`
	Agents                []SessionAgent            `json:"agents"`
	Transcript            []TranscriptEntry         `json:"transcript"`
	ConsecutiveAgentTurns int                       `json:"consecutiveAgentTurns"`
	LastSpeakerId         string                    `json:"lastSpeakerId,omitempty"`
	LastSpeakerType       string                    `json:"lastSpeakerType,omitempty"` // "human" or "agent"
	LastAgentId           string                    `json:"lastAgentId,omitempty"`
	LastAddressedAgentId  string                    `json:"lastAddressedAgentId,omitempty"` // Conversational thread: who was last addressed by name
	LastAddressedAt       time.Time                 `json:"lastAddressedAt,omitempty"`      // When the thread was established. NOTE(review): omitempty never omits a time.Time, so the zero time is still serialized
	HandoffChainDepth     int                       `json:"handoffChainDepth"`              // Hops in the current handoff chain; reset on non-handoff outcomes.
	AgentTurnCounts       map[string]int            `json:"agentTurnCounts"`
	StateMachine          *ConversationStateMachine `json:"-"`
	Prediction            *PredictiveState          `json:"prediction,omitempty"`
	PendingEvents         []map[string]any          `json:"pendingEvents,omitempty"` // Queue for notifications to surface
	CompactedSummary      string                    `json:"-"`                       // Zone 1: SI-generated summary of old messages
	LastCompactionAt      time.Time                 `json:"-"`                       // When compaction last ran
	CompactionIndex       int                       `json:"-"`                       // Transcript index up to which summary covers
	CreatedAt             time.Time                 `json:"createdAt"`
	LastActivityAt        time.Time                 `json:"lastActivityAt"`
	// contains filtered or unexported fields
}

PolyphonSession tracks the state of a multi-agent conversation in a space.

func NewSession ¶

func NewSession(spaceId string) *PolyphonSession

NewSession creates a new PolyphonSession for the given space.

func (*PolyphonSession) AddTranscript ¶

func (s *PolyphonSession) AddTranscript(entry TranscriptEntry)

AddTranscript appends a transcript entry and updates session state.

func (*PolyphonSession) AgentTurnsSinceHuman ¶

func (s *PolyphonSession) AgentTurnsSinceHuman() int

AgentTurnsSinceHuman returns how many consecutive agent turns have occurred since a human last spoke.

func (*PolyphonSession) GetAddressedAgent ¶

func (s *PolyphonSession) GetAddressedAgent() (string, time.Time)

GetAddressedAgent returns the currently addressed agent ID and the time it was set. It returns an empty string and the zero time if there is no active thread.

func (*PolyphonSession) GetHandoffChainDepth ¶

func (s *PolyphonSession) GetHandoffChainDepth() int

GetHandoffChainDepth returns the current handoff chain depth. 0 means the most recent turn was not a handoff (specialist answered or general-assistant fallback with no prior handoff); >0 means we are inside a chain of handoffs of that length.

func (*PolyphonSession) GetPrediction ¶

func (s *PolyphonSession) GetPrediction() *PredictiveState

GetPrediction returns the current predictive state, or nil if unavailable.

func (*PolyphonSession) LastAgentSpokeAt ¶

func (s *PolyphonSession) LastAgentSpokeAt(agentId string) time.Time

LastAgentSpokeAt returns the most recent time the given agent spoke, or zero time if the agent hasn't spoken.

func (*PolyphonSession) RecentTranscript ¶

func (s *PolyphonSession) RecentTranscript(n int) []TranscriptEntry

RecentTranscript returns the last n entries from the transcript.

func (*PolyphonSession) RecordHandoffOutcome ¶

func (s *PolyphonSession) RecordHandoffOutcome(handoff bool)

RecordHandoffOutcome updates chain depth based on the current turn's handoff flag: increments by 1 when handoff=true, resets to 0 when false. Called by the router after each LLM routing decision.

func (*PolyphonSession) SetAddressedAgent ¶

func (s *PolyphonSession) SetAddressedAgent(agentId string)

SetAddressedAgent updates the conversational thread to the given agent.

func (*PolyphonSession) SetPrediction ¶

func (s *PolyphonSession) SetPrediction(state *PredictiveState)

SetPrediction updates the session's predictive state.

type PredictiveAnalyzer ¶

type PredictiveAnalyzer interface {
	// Analyze processes recent conversation state and returns a predictive
	// snapshot. Called asynchronously -- must not block the main flow.
	Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
}

PredictiveAnalyzer runs async analysis on the conversation to anticipate what might happen next. The score engine uses this to make better routing decisions and take proactive actions.

FUTURE: Initial implementation will use a cheap SI model call to classify conversation phase and anticipate topics. More sophisticated prediction (topic modeling, intent classification, conversation trajectory) can be layered on later.

type PredictiveState ¶

type PredictiveState struct {
	// AnticipatedTopics are subjects the conversation is likely to explore next.
	AnticipatedTopics []string `json:"anticipatedTopics,omitempty"`
	// LikelyNextAgent is the agent most likely to be needed next.
	LikelyNextAgent string `json:"likelyNextAgent,omitempty"`
	// Confidence in the prediction (0.0-1.0).
	Confidence float64 `json:"confidence,omitempty"`
	// ConversationPhase classifies the current stage of the conversation.
	// Values: "greeting", "exploration", "task", "decision", "winding-down"
	ConversationPhase string `json:"conversationPhase,omitempty"`
	// SuggestedAction is a proactive action the score engine could take.
	SuggestedAction string `json:"suggestedAction,omitempty"`
	// LastUpdated is when this prediction was last computed.
	// NOTE(review): encoding/json's omitempty never treats a struct value such
	// as time.Time as empty, so a zero LastUpdated is still serialized; the
	// omitzero option (Go 1.24+) would be needed to drop it.
	LastUpdated time.Time `json:"lastUpdated,omitempty"`
}

PredictiveState represents the score engine's anticipation of where the conversation is heading. Updated asynchronously by the PredictiveAnalyzer.

FUTURE: The actual prediction logic (SI-powered conversation phase classification, topic anticipation, intent forecasting) will be added in a follow-up implementation. This type and the PredictiveAnalyzer interface establish the extension points.

type RoomInfo ¶

type RoomInfo struct {
	RoomName string            `json:"roomName"`
	RoomSID  string            `json:"roomSID"`
	SpaceId  string            `json:"spaceId"`
	Active   bool              `json:"active"`
	Humans   []RoomParticipant `json:"humans"`
	Agents   []RoomParticipant `json:"agents"`
}

RoomInfo describes the current state of an audio room.

type RoomParticipant ¶

type RoomParticipant struct {
	ID         string `json:"id"`
	Name       string `json:"name"`
	Type       string `json:"type"` // "human" or "agent"
	IsSpeaking bool   `json:"isSpeaking"`
}

RoomParticipant describes a participant in a room.

type RoomProvider ¶

type RoomProvider interface {
	// CreateRoom creates a new audio room for a space.
	CreateRoom(ctx context.Context, spaceId string, agents []AgentRoomConfig) (*RoomInfo, error)

	// DestroyRoom tears down a room and all its participants.
	DestroyRoom(ctx context.Context, spaceId string) error

	// GenerateToken creates an access token for a participant to join a room.
	GenerateToken(ctx context.Context, spaceId, participantId, displayName string) (*RoomToken, error)

	// GetRoomInfo returns the current state of a room.
	GetRoomInfo(ctx context.Context, spaceId string) (*RoomInfo, error)
}

RoomProvider abstracts the audio room management (LiveKit or equivalent). The Bridge Agent implements this interface to manage multi-party audio rooms.

type RoomToken ¶

type RoomToken struct {
	Token      string `json:"token"`
	RoomName   string `json:"roomName"`
	LiveKitURL string `json:"livekitUrl"`
	ExpiresAt  int64  `json:"expiresAt"` // Unix timestamp
}

RoomToken is an access token for joining a room.

type ScoreDecision ¶

type ScoreDecision struct {
	SpaceId           string            `json:"spaceId"`
	UtteranceId       string            `json:"utteranceId"`
	Winner            *AgentScore       `json:"winner,omitempty"`
	Scores            []AgentScore      `json:"scores"`
	Action            string            `json:"action"`     // "respond", "continue", "silence"
	Confidence        string            `json:"confidence"` // "high", "medium", "low" -- determines whether SI router is invoked
	Continuation      bool              `json:"continuation"`
	ConversationPhase ConversationPhase `json:"conversationPhase,omitempty"`
	ResponseDelay     time.Duration     `json:"-"`
	Timestamp         time.Time         `json:"timestamp"`
}

ScoreDecision is the output of the score engine for a given utterance.

func (*ScoreDecision) HasWinner ¶

func (d *ScoreDecision) HasWinner() bool

HasWinner returns true if a winning agent was selected.

type ScoreEngine ¶

type ScoreEngine struct {
	// contains filtered or unexported fields
}

ScoreEngine is the Polyphon multi-agent scoring engine. It receives utterances, scores all agents, applies turn-taking policy, and returns a decision about which agent should respond.

func NewScoreEngine ¶

func NewScoreEngine(logger *slog.Logger) *ScoreEngine

NewScoreEngine creates a new ScoreEngine with default configuration.

func NewScoreEngineWithConfig ¶

func NewScoreEngineWithConfig(weights ScoringWeights, policyConfig TurnPolicyConfig, logger *slog.Logger) *ScoreEngine

NewScoreEngineWithConfig creates a ScoreEngine with custom scoring weights and turn policy.

func (*ScoreEngine) Policy ¶

func (c *ScoreEngine) Policy() *TurnPolicy

Policy returns the TurnPolicy for external inspection.

func (*ScoreEngine) ProcessUtterance ¶

func (c *ScoreEngine) ProcessUtterance(_ context.Context, utterance Utterance, candidates []AgentCandidate) *ScoreDecision

ProcessUtterance evaluates a human utterance and decides which agent (if any) should respond. This is the main entry point for the ScoreEngine.

func (*ScoreEngine) RecordAgentResponse ¶

func (c *ScoreEngine) RecordAgentResponse(spaceId string, agentId, agentName, responseText string, candidates []AgentCandidate) (bool, *AgentScore)

RecordAgentResponse records that an agent produced a response. Call this after the agent's response has been generated and is being delivered. Returns a continuation decision: should another agent follow up?

func (*ScoreEngine) Scorer ¶

func (c *ScoreEngine) Scorer() *Scorer

Scorer returns the Scorer for external inspection.

func (*ScoreEngine) Sessions ¶

func (c *ScoreEngine) Sessions() *SessionManager

Sessions returns the SessionManager for direct session operations.

type Scorer ¶

type Scorer struct {
	Weights ScoringWeights
	Policy  TurnPolicyConfig
}

Scorer evaluates all agent candidates and returns scored results.

func NewScorer ¶

func NewScorer(weights ScoringWeights, policy TurnPolicyConfig) *Scorer

NewScorer creates a Scorer with the given weights and turn policy.

func (*Scorer) ScoreAll ¶

func (s *Scorer) ScoreAll(candidates []AgentCandidate, utterance Utterance, session *PolyphonSession) []AgentScore

ScoreAll evaluates every agent candidate against the utterance and session state. Returns a slice of AgentScore sorted by TotalScore descending.

type ScoringFactor ¶

type ScoringFactor struct {
	Name   string  `json:"name"`
	Weight float64 `json:"weight"`
	Value  float64 `json:"value"`  // Raw value (0.0–1.0)
	Score  float64 `json:"score"`  // Weight × Value
	Detail string  `json:"detail"` // Human-readable explanation
}

ScoringFactor represents a single factor contributing to an agent's score.

type ScoringWeights ¶

type ScoringWeights struct {
	DirectAddress         float64 `json:"directAddress"`
	DomainRelevance       float64 `json:"domainRelevance"`
	ConversationalThread  float64 `json:"conversationalThread"`
	ConversationRecency   float64 `json:"conversationRecency"`
	QuestionDetection     float64 `json:"questionDetection"`
	ContinuationRelevance float64 `json:"continuationRelevance"`
}

ScoringWeights defines the weight (0–100) for each scoring factor. The sum of all weights should equal 100.

func DefaultScoringWeights ¶

func DefaultScoringWeights() ScoringWeights

DefaultScoringWeights returns the default scoring factor weights.

The cognitionRouting SI prompt is the primary turn-taking decision; the heuristic scorer produces signals the router reads (per-agent fit, intent, mentions) rather than itself picking the winner. That's why ConversationalThread is zeroed: the router already sees the previous responder and recent transcript and reasons about continuity directly; adding a silent +15 per turn caused the same agent to dominate once they got the first word in. Direct address and domain relevance still carry weight because they're observable signals, not preferences.

type SessionAgent ¶

type SessionAgent struct {
	AgentId       string    `json:"agentId"`
	ParticipantId string    `json:"participantId"`
	Name          string    `json:"name"`
	JoinedAt      time.Time `json:"joinedAt"`
}

SessionAgent represents an SI agent participant in a Polyphon session.

type SessionManager ¶

type SessionManager struct {
	// contains filtered or unexported fields
}

SessionManager manages Polyphon sessions across spaces. Sessions are kept in memory (not persisted to database) because they represent ephemeral real-time voice state. If memQL restarts, active sessions are recreated when participants rejoin.

func NewSessionManager ¶

func NewSessionManager(logger *slog.Logger) *SessionManager

NewSessionManager creates a new SessionManager.

func (*SessionManager) ActiveSessions ¶

func (m *SessionManager) ActiveSessions() int

ActiveSessions returns the number of active Polyphon sessions.

func (*SessionManager) AddAgent ¶

func (m *SessionManager) AddAgent(spaceId string, agentId, participantId, name string) error

AddAgent registers an SI agent in the session.

func (*SessionManager) AddHuman ¶

func (m *SessionManager) AddHuman(spaceId string, participantId, displayName string) error

AddHuman registers a human participant in the session.

func (*SessionManager) Get ¶

func (m *SessionManager) Get(spaceId string) *PolyphonSession

Get returns the session for the given space, or nil if not found.

func (*SessionManager) GetOrCreate ¶

func (m *SessionManager) GetOrCreate(spaceId string) *PolyphonSession

GetOrCreate returns the session for the given space, creating one if needed.

func (*SessionManager) HasHeartbeat ¶

func (m *SessionManager) HasHeartbeat(spaceId string) bool

HasHeartbeat returns true if a heartbeat is running for the given space.

func (*SessionManager) HasHumans ¶

func (m *SessionManager) HasHumans(spaceId string) bool

HasHumans returns true if the session for the given space has at least one human.

func (*SessionManager) HasPrediction ¶

func (m *SessionManager) HasPrediction(spaceId string) bool

HasPrediction returns true if a prediction goroutine is running for the given space.

func (*SessionManager) RecordUtterance ¶

func (m *SessionManager) RecordUtterance(spaceId string, entry TranscriptEntry)

RecordUtterance adds a transcript entry to the session and trims old entries if the transcript exceeds the maximum length.

func (*SessionManager) Remove ¶

func (m *SessionManager) Remove(spaceId string) *PolyphonSession

Remove removes and returns the session for the given space. Also stops the heartbeat goroutine if running.

func (*SessionManager) RemoveAgent ¶

func (m *SessionManager) RemoveAgent(spaceId, agentId string)

RemoveAgent removes an SI agent from the session.

func (*SessionManager) RemoveHuman ¶

func (m *SessionManager) RemoveHuman(spaceId, participantId string)

RemoveHuman removes a human participant from the session.

func (*SessionManager) StartHeartbeat ¶

func (m *SessionManager) StartHeartbeat(
	spaceId string,
	interval time.Duration,
	evaluator HeartbeatEvaluator,
	candidates []AgentCandidate,
	actionHandler func(HeartbeatAction),
)

StartHeartbeat begins a background goroutine that ticks every interval and evaluates whether the score engine should take proactive action. The evaluator checks silence duration, pending events, and predictive state. The actionHandler callback is invoked when the evaluator decides to act.

Calling StartHeartbeat on a space that already has a heartbeat is a no-op. Call StopHeartbeat to stop the goroutine.

func (*SessionManager) StartPrediction ¶

func (m *SessionManager) StartPrediction(spaceId string, interval time.Duration, analyzer PredictiveAnalyzer, candidates []AgentCandidate)

StartPrediction launches a background goroutine that runs the predictive analyzer at the given interval for the specified space. Only runs when conversation is active.

func (*SessionManager) StopHeartbeat ¶

func (m *SessionManager) StopHeartbeat(spaceId string)

StopHeartbeat stops the heartbeat goroutine for the given space.

func (*SessionManager) StopPrediction ¶

func (m *SessionManager) StopPrediction(spaceId string)

StopPrediction stops the prediction goroutine for a space.

type SessionParticipant ¶

type SessionParticipant struct {
	ParticipantId string    `json:"participantId"`
	DisplayName   string    `json:"displayName"`
	JoinedAt      time.Time `json:"joinedAt"`
}

SessionParticipant represents a human participant in a Polyphon session.

type TTSChunk ¶

type TTSChunk struct {
	Audio    []byte `json:"audio"`    // PCM16 or WAV audio data
	Sequence int    `json:"sequence"` // Chunk sequence number
	Done     bool   `json:"done"`     // Is this the last chunk?
}

TTSChunk is a chunk of synthesized audio from the TTS provider.

type TTSConfig ¶

type TTSConfig struct {
	// Text to synthesize.
	Text string `json:"text"`

	// VoiceModel is the voice identity to use (provider-specific).
	// For OpenAI: "alloy", "nova", "coral", etc.
	VoiceModel string `json:"voiceModel"`

	// SampleRate in Hz for output audio (default: 16000).
	SampleRate int `json:"sampleRate"`

	// Speed controls speaking rate (1.0 = normal, 0.5 = slow, 2.0 = fast).
	Speed float64 `json:"speed"`

	// Language is a BCP-47 language code (e.g., "en-US").
	Language string `json:"language"`
}

TTSConfig configures a TTS synthesis request.

func DefaultTTSConfig ¶

func DefaultTTSConfig(text, voiceModel string) TTSConfig

DefaultTTSConfig returns sensible defaults for TTS configuration.

type TTSProvider ¶

type TTSProvider interface {
	// Synthesize converts text to speech audio, returning a reader of audio data.
	// The audio format depends on the provider (typically PCM16 or WAV).
	Synthesize(ctx context.Context, config TTSConfig) (io.ReadCloser, error)

	// SynthesizeStream converts text to speech with streaming output.
	// Audio chunks arrive on the channel exposed by the returned TTSStream's
	// Chunks method as they're generated; Close on the stream terminates it.
	SynthesizeStream(ctx context.Context, config TTSConfig) (TTSStream, error)

	// AvailableVoices returns the list of voice models this provider supports.
	AvailableVoices(ctx context.Context) ([]VoiceInfo, error)
}

TTSProvider abstracts text-to-speech synthesis. The implementation is OpenAI TTS in Stage 1; Stage 2 of the Deepgram migration adds Aura-2 as the new default.

type TTSStream ¶

type TTSStream interface {
	// Chunks returns a channel of audio chunks as they're generated.
	Chunks() <-chan TTSChunk

	// Close terminates the stream.
	Close() error
}

TTSStream represents a streaming TTS session.

type TokenBudget ¶

type TokenBudget struct {
	ContextWindow int // Total context window (e.g., 128000)
	MaxOutput     int // Reserved for response (e.g., 4096)
	SystemPrompt  int // Estimated system prompt tokens (e.g., 2000)
	SafetyMargin  int // Buffer (e.g., 1000)
	Available     int // Computed: ContextWindow - MaxOutput - SystemPrompt - SafetyMargin
}

TokenBudget represents the available token space for conversation context.

func CalculateTokenBudget ¶

func CalculateTokenBudget(contextWindow, maxOutput, systemPromptEst int) TokenBudget

CalculateTokenBudget computes the available token budget for conversation history.

type TranscriptEntry ¶

type TranscriptEntry struct {
	ID          string    `json:"id"`
	SpaceId     string    `json:"spaceId"`
	SpeakerId   string    `json:"speakerId"`
	SpeakerName string    `json:"speakerName"`
	SpeakerType string    `json:"speakerType"` // "human" or "agent"
	Text        string    `json:"text"`
	Timestamp   time.Time `json:"timestamp"`
	UtteranceId string    `json:"utteranceId,omitempty"`
}

TranscriptEntry is a single entry in the shared conversation transcript.

type TurnPolicy ¶

type TurnPolicy struct {
	Config TurnPolicyConfig
}

TurnPolicy enforces turn-taking rules for multi-agent conversations.

func NewTurnPolicy ¶

func NewTurnPolicy(cfg TurnPolicyConfig) *TurnPolicy

NewTurnPolicy creates a TurnPolicy with the given configuration.

func (*TurnPolicy) CanAgentRespond ¶

func (p *TurnPolicy) CanAgentRespond(agentId string, score float64, session *PolyphonSession) bool

CanAgentRespond checks if a specific agent is allowed to respond given the current session state and policy constraints.

func (*TurnPolicy) EvaluateScores ¶

func (p *TurnPolicy) EvaluateScores(scores []AgentScore, session *PolyphonSession) []AgentScore

EvaluateScores takes the scored agents and the session state, and marks which agents should respond. It applies the response threshold and handles the speakWhen="always" case.

func (*TurnPolicy) ResponseDelay ¶

func (p *TurnPolicy) ResponseDelay() time.Duration

ResponseDelay returns a randomized "thinking" pause duration to make the agent response feel more natural.

func (*TurnPolicy) ShouldContinue ¶

func (p *TurnPolicy) ShouldContinue(scores []AgentScore, session *PolyphonSession) (bool, *AgentScore)

ShouldContinue determines whether another agent should speak after the current agent just finished. It checks the continuation threshold and the consecutive agent turn limit.

type TurnPolicyConfig ¶

type TurnPolicyConfig struct {
	// ResponseThreshold is the minimum score (0–100) for an agent to respond.
	ResponseThreshold float64 `json:"responseThreshold"`

	// ContinuationThreshold is the minimum score for a follow-up agent turn.
	ContinuationThreshold float64 `json:"continuationThreshold"`

	// MaxConsecutiveAgentTurns is the hard cap on agent turns before yielding to humans.
	MaxConsecutiveAgentTurns int `json:"maxConsecutiveAgentTurns"`

	// MinResponseDelayMs is the minimum "thinking" pause in milliseconds.
	MinResponseDelayMs int `json:"minResponseDelayMs"`

	// MaxResponseDelayMs is the maximum "thinking" pause in milliseconds.
	MaxResponseDelayMs int `json:"maxResponseDelayMs"`

	// RecencyDecayFactor controls how quickly recency score decreases (0.0–1.0).
	// Higher values = faster decay = less likely to repeat the same agent.
	RecencyDecayFactor float64 `json:"recencyDecayFactor"`

	// TextThreadTimeout is how long a conversational thread stays active for text input.
	TextThreadTimeout time.Duration `json:"-"`
	// VoiceThreadTimeout is how long a conversational thread stays active for voice input.
	VoiceThreadTimeout time.Duration `json:"-"`
}

TurnPolicyConfig holds configurable thresholds for turn-taking behavior.

func DefaultTurnPolicy ¶

func DefaultTurnPolicy() TurnPolicyConfig

DefaultTurnPolicy returns a TurnPolicyConfig with sensible defaults. ResponseThreshold of 30 ensures agents with strong domain relevance can respond without requiring direct addressing.

type Utterance ¶

type Utterance struct {
	ID            string            `json:"id"`
	SpaceId       string            `json:"spaceId"`
	ParticipantId string            `json:"participantId"`
	SpeakerName   string            `json:"speakerName,omitempty"`
	Text          string            `json:"text"`
	IsFinal       bool              `json:"isFinal"`
	IsVoice       bool              `json:"isVoice,omitempty"`
	Timestamp     time.Time         `json:"timestamp"`
	Mentions      []Mention         `json:"mentions,omitempty"`
	Intent        *IntentResult     `json:"intent,omitempty"`
	Source        map[string]string `json:"source,omitempty"`
	Embedding     []float32         `json:"-"` // Pre-computed embedding for vector domain scoring
}

Utterance represents a transcribed speech segment from a human participant.

type VoiceInfo ¶

type VoiceInfo struct {
	ID       string `json:"id"`
	Name     string `json:"name"`
	Language string `json:"language"`
	Gender   string `json:"gender"` // "male", "female", "neutral"
}

VoiceInfo describes an available TTS voice.

type VoicePlatform ¶

type VoicePlatform string

VoicePlatform identifies which voice provider flavor is active.

const (
	// PlatformOpenAI uses OpenAI Realtime transcription (ASR) + OpenAI TTS API.
	// Current default; replaced by Deepgram (Nova-3 + Aura-2) in Stage 2 of
	// the Deepgram migration plan.
	PlatformOpenAI VoicePlatform = "openai"
)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL