Documentation
¶
Overview ¶
Package polyphon implements the Polyphon multi-agent voice conversation orchestration system. It scores multiple SI agents, manages turn-taking, and decides which agent should respond to human utterances.
Polyphon (from Greek "polyphōnía" — many voices) enables up to 3 SI agents and 5 humans to participate in natural real-time voice conversations.
Index ¶
- Constants
- Variables
- func EstimateTokens(text string) int
- func ExpandDomains(domains []string) []string
- func LookupContextWindow(model string) int
- func SplitSentences(text string) []string
- type ASRConfig
- type ASRProvider
- type ASRResult
- type ASRResultKind
- type ASRStream
- type AgentCandidate
- type AgentRoomConfig
- type AgentScore
- type CompactedContext
- type CompactionLevel
- type Config
- type ContextBuilder
- func (cb *ContextBuilder) BuildForAgent(session *PolyphonSession, agentId string) *CompactedContext
- func (cb *ContextBuilder) BuildForAgentSemantic(session *PolyphonSession, agentId string, utteranceEmbedding []float32, ...) *CompactedContext
- func (cb *ContextBuilder) BuildForAgentWithBudget(session *PolyphonSession, agentId string, budget TokenBudget) *CompactedContext
- func (cb *ContextBuilder) EntriesForCompaction(session *PolyphonSession) []TranscriptEntry
- func (cb *ContextBuilder) TriggerCompaction(ctx context.Context, session *PolyphonSession, ...)
- func (cb *ContextBuilder) UpdateSummary(session *PolyphonSession, summary string, upToIndex int)
- type ConversationPhase
- type ConversationStateMachine
- type DefaultHeartbeatEvaluator
- type ExternalPredictiveAnalyzer
- type HeartbeatAction
- type HeartbeatEvaluator
- type IntentResult
- type IntentType
- type LocalRoomProvider
- func (p *LocalRoomProvider) CreateRoom(_ context.Context, _ string, _ []AgentRoomConfig) (*RoomInfo, error)
- func (p *LocalRoomProvider) DestroyRoom(_ context.Context, _ string) error
- func (p *LocalRoomProvider) GenerateToken(_ context.Context, spaceId, participantId, displayName string) (*RoomToken, error)
- func (p *LocalRoomProvider) GetRoomInfo(_ context.Context, _ string) (*RoomInfo, error)
- type Mention
- type MentionRole
- type ModelPredictiveAnalyzer
- type NoOpPredictiveAnalyzer
- type ParticipantRef
- type PhasePatternLibrary
- type PolyphonSession
- func (s *PolyphonSession) AddTranscript(entry TranscriptEntry)
- func (s *PolyphonSession) AgentTurnsSinceHuman() int
- func (s *PolyphonSession) GetAddressedAgent() (string, time.Time)
- func (s *PolyphonSession) GetHandoffChainDepth() int
- func (s *PolyphonSession) GetPrediction() *PredictiveState
- func (s *PolyphonSession) LastAgentSpokeAt(agentId string) time.Time
- func (s *PolyphonSession) RecentTranscript(n int) []TranscriptEntry
- func (s *PolyphonSession) RecordHandoffOutcome(handoff bool)
- func (s *PolyphonSession) SetAddressedAgent(agentId string)
- func (s *PolyphonSession) SetPrediction(state *PredictiveState)
- type PredictiveAnalyzer
- type PredictiveState
- type RoomInfo
- type RoomParticipant
- type RoomProvider
- type RoomToken
- type ScoreDecision
- type ScoreEngine
- func (c *ScoreEngine) Policy() *TurnPolicy
- func (c *ScoreEngine) ProcessUtterance(_ context.Context, utterance Utterance, candidates []AgentCandidate) *ScoreDecision
- func (c *ScoreEngine) RecordAgentResponse(spaceId string, agentId, agentName, responseText string, ...) (bool, *AgentScore)
- func (c *ScoreEngine) Scorer() *Scorer
- func (c *ScoreEngine) Sessions() *SessionManager
- type Scorer
- type ScoringFactor
- type ScoringWeights
- type SessionAgent
- type SessionManager
- func (m *SessionManager) ActiveSessions() int
- func (m *SessionManager) AddAgent(spaceId string, agentId, participantId, name string) error
- func (m *SessionManager) AddHuman(spaceId string, participantId, displayName string) error
- func (m *SessionManager) Get(spaceId string) *PolyphonSession
- func (m *SessionManager) GetOrCreate(spaceId string) *PolyphonSession
- func (m *SessionManager) HasHeartbeat(spaceId string) bool
- func (m *SessionManager) HasHumans(spaceId string) bool
- func (m *SessionManager) HasPrediction(spaceId string) bool
- func (m *SessionManager) RecordUtterance(spaceId string, entry TranscriptEntry)
- func (m *SessionManager) Remove(spaceId string) *PolyphonSession
- func (m *SessionManager) RemoveAgent(spaceId, agentId string)
- func (m *SessionManager) RemoveHuman(spaceId, participantId string)
- func (m *SessionManager) StartHeartbeat(spaceId string, interval time.Duration, evaluator HeartbeatEvaluator, ...)
- func (m *SessionManager) StartPrediction(spaceId string, interval time.Duration, analyzer PredictiveAnalyzer, ...)
- func (m *SessionManager) StopHeartbeat(spaceId string)
- func (m *SessionManager) StopPrediction(spaceId string)
- type SessionParticipant
- type TTSChunk
- type TTSConfig
- type TTSProvider
- type TTSStream
- type TokenBudget
- type TranscriptEntry
- type TurnPolicy
- func (p *TurnPolicy) CanAgentRespond(agentId string, score float64, session *PolyphonSession) bool
- func (p *TurnPolicy) EvaluateScores(scores []AgentScore, session *PolyphonSession) []AgentScore
- func (p *TurnPolicy) ResponseDelay() time.Duration
- func (p *TurnPolicy) ShouldContinue(scores []AgentScore, session *PolyphonSession) (bool, *AgentScore)
- type TurnPolicyConfig
- type Utterance
- type VoiceInfo
- type VoicePlatform
Constants ¶
const MaxAgentsPerSpace = 3
MaxAgentsPerSpace is the maximum number of SI agents in a Polyphon session.
const MaxHumansPerSpace = 5
MaxHumansPerSpace is the maximum number of human participants in a Polyphon session.
Variables ¶
var DefaultContextWindows = map[string]int{
"gpt-5.4": 128000,
"gpt-5.4-pro": 256000,
"gpt-5.4-mini": 128000,
"gpt-5.4-nano": 32000,
"gpt-5.3-chat-latest": 128000,
"o4-mini": 200000,
"gpt-5.3-codex": 256000,
"gpt-5.1-codex-max": 400000,
"gpt-5-search-api": 128000,
"o4-mini-deep-research": 200000,
"o3-deep-research": 200000,
"claude-opus-4-6": 200000,
"claude-sonnet-4-6": 200000,
"claude-haiku-4-5": 200000,
}
DefaultContextWindows maps known model prefixes to their context window sizes. Used as fallback when contextWindow is not specified in the provider config. Keep this in sync with providers/v1/openai/*.memql -- the provider file's contextWindow param is the source of truth at runtime; this table is consulted only when a caller asks for the budget without a loaded provider config.
Functions ¶
func EstimateTokens ¶
EstimateTokens provides a fast token estimate (characters / 4). The estimate is within ~20% of the actual token count for English text.
func ExpandDomains ¶
ExpandDomains returns the input domains plus common synonyms and aliases. Duplicates are removed. Original terms are always preserved.
func LookupContextWindow ¶
LookupContextWindow returns the context window for a model name. First checks the lookup table by exact match, then by prefix match. Falls back to 32000 for unknown models (conservative default).
func SplitSentences ¶
SplitSentences splits text into sentences at natural boundaries.
Rules:
- Split on . ! ? followed by whitespace or end-of-string
- Don't split abbreviations (Mr., Dr., e.g., i.e., etc.)
- Don't split decimal numbers (3.14, $5.99)
- Don't split inside quoted text
- Merge very short sentences (<20 chars) with the next sentence
- Flush remaining text as final sentence (even without punctuation)
Types ¶
type ASRConfig ¶
type ASRConfig struct {
// SampleRate in Hz (default: 16000).
SampleRate int `json:"sampleRate"`
// Language is a BCP-47 language code (e.g., "en-US", "es-MX").
Language string `json:"language"`
// EnableDiarization enables speaker identification in the transcript.
EnableDiarization bool `json:"enableDiarization"`
// MaxSpeakers is the maximum number of speakers for diarization.
MaxSpeakers int `json:"maxSpeakers"`
}
ASRConfig configures a streaming ASR session.
func DefaultASRConfig ¶
func DefaultASRConfig() ASRConfig
DefaultASRConfig returns sensible defaults for ASR configuration.
type ASRProvider ¶
type ASRProvider interface {
// StartStream begins a new streaming transcription session.
// Audio chunks are sent via SendAudio, and transcription results
// arrive on the Results channel.
StartStream(ctx context.Context, config ASRConfig) (ASRStream, error)
}
ASRProvider abstracts speech-to-text transcription. The implementation is OpenAI Realtime in Stage 1; Stage 2 of the Deepgram migration adds Deepgram Nova-3 as the new default.
type ASRResult ¶
type ASRResult struct {
Text string `json:"text"`
IsFinal bool `json:"isFinal"`
Confidence float64 `json:"confidence"`
SpeakerId string `json:"speakerId,omitempty"` // Only if diarization enabled
// Kind discriminates the turn-structure role of this result.
// The zero value (ASRKindTranscript) preserves the historical
// Text/IsFinal behavior; ASRKindSpeechStarted carries an onset
// signal with no text. Additive for the turn-taking machine (#455);
// transcript-only consumers ignore it.
Kind ASRResultKind `json:"kind,omitempty"`
}
ASRResult is a transcription result from the ASR provider.
type ASRResultKind ¶
type ASRResultKind int
ASRResultKind discriminates the turn-structure role of an ASRResult. It is an additive, backward-compatible enrichment of the stream contract (see docs/voice/452-turntaking-orchestration-go.md, step 1): the zero value (ASRKindTranscript) preserves the historical interim/final-via-IsFinal behavior, so existing consumers that only read Text/IsFinal are unaffected. The Go turn-taking machine (#455) additionally reads Kind to drive barge-in onset off ASRKindSpeechStarted.
const (
// ASRKindTranscript is the default kind: a transcript update whose
// finality is carried by IsFinal (interim when false, committed
// end-of-utterance when true). This is the only kind pre-#455
// providers emitted, so it is the zero value for backward
// compatibility.
ASRKindTranscript ASRResultKind = iota
// ASRKindSpeechStarted marks a voice-activity onset (Deepgram's
// SpeechStarted VAD event). It carries no transcript text; it is the
// signal the turn-taking machine uses to enter human-turn and to
// raise a barge-in candidate while the assistant has the floor.
ASRKindSpeechStarted
)
type ASRStream ¶
type ASRStream interface {
// SendAudio sends a chunk of audio data for transcription.
// Audio must be PCM16, 16kHz, mono.
SendAudio(audio []byte) error
// Results returns a channel of transcription results.
// Intermediate (non-final) results may arrive before the final transcript.
Results() <-chan ASRResult
// Close terminates the stream and releases resources.
Close() error
}
ASRStream represents an active streaming transcription session.
type AgentCandidate ¶
type AgentCandidate struct {
ID string `json:"id"`
ParticipantId string `json:"participantId"`
Name string `json:"name"`
Description string `json:"description,omitempty"`
Personality string `json:"personality,omitempty"`
Role string `json:"role,omitempty"` // e.g. "assistant", "accounting-finance"
Domains []string `json:"domains,omitempty"`
Keywords []string `json:"keywords,omitempty"`
SpeakWhen string `json:"speakWhen,omitempty"` // "asked", "relevant", "always"
InterruptStyle string `json:"interruptStyle,omitempty"` // "polite", "assertive", "passive"
ContinuationBehavior string `json:"continuationBehavior,omitempty"` // "add_context", "disagree", "summarize"
ProviderConfig map[string]interface{} `json:"providerConfig,omitempty"`
Capabilities map[string]interface{} `json:"capabilities,omitempty"`
ProfileEmbedding []float32 `json:"-"` // Pre-loaded from node_vectors for vector domain scoring
}
AgentCandidate represents an SI agent being evaluated by the score engine.
type AgentRoomConfig ¶
type AgentRoomConfig struct {
AgentId string `json:"agentId"`
Name string `json:"name"`
VoiceModel string `json:"voiceModel"`
}
AgentRoomConfig describes an SI agent to be added to a room.
type AgentScore ¶
type AgentScore struct {
AgentId string `json:"agentId"`
AgentName string `json:"agentName"`
TotalScore float64 `json:"totalScore"` // Weighted sum (0–100)
Factors []ScoringFactor `json:"factors"`
ShouldRespond bool `json:"shouldRespond"`
Reason string `json:"reason"`
ToolsNeeded bool `json:"toolsNeeded,omitempty"` // SI router signals the agent needs tools for this utterance
}
AgentScore is the scoring result for a single agent candidate.
type CompactedContext ¶
type CompactedContext struct {
Summary string // Zone 1
RecentMessages []TranscriptEntry // Zone 2
AgentMessages []TranscriptEntry // Zone 3
TotalTokensEst int
}
CompactedContext is an agent-scoped view of the conversation history. Instead of passing all messages to the AI, it provides 3 zones:
- Zone 1 (Summary): compressed old history
- Zone 2 (Recent): last N messages verbatim
- Zone 3 (Agent-Relevant): this agent's own exchanges
type CompactionLevel ¶
type CompactionLevel int
CompactionLevel determines how aggressively to compact based on budget usage.
const (
CompactionNormal CompactionLevel = iota // Budget < 70%: full context
CompactionAggressive // 70-90%: reduced context
CompactionEmergency // 90-100%: minimal context
CompactionReset // Overflow: start fresh
)
func DetermineCompactionLevel ¶
func DetermineCompactionLevel(estimatedTokens, availableBudget int) CompactionLevel
DetermineCompactionLevel returns the appropriate compaction level based on estimated token usage vs available budget.
type Config ¶
type Config struct {
// LiveKitURL is the LiveKit server WebSocket URL (used for server-side connections).
// Set via POLYPHON_LIVEKIT_URL environment variable.
LiveKitURL string
// LiveKitPublicURL is the browser-reachable LiveKit WebSocket URL.
// In Docker, the internal hostname (e.g. ws://livekit:7880) is not reachable
// from the browser, so this provides the external URL (e.g. ws://localhost:7880).
// Falls back to LiveKitURL if not set.
// Set via POLYPHON_LIVEKIT_PUBLIC_URL environment variable.
LiveKitPublicURL string
// LiveKitAPIKey is the LiveKit API key.
// Set via POLYPHON_LIVEKIT_API_KEY environment variable.
LiveKitAPIKey string
// LiveKitAPISecret is the LiveKit API secret.
// Set via POLYPHON_LIVEKIT_API_SECRET environment variable.
LiveKitAPISecret string
// Scoring weights override (optional).
Weights *ScoringWeights
// Turn policy override (optional).
Policy *TurnPolicyConfig
}
Config holds the Polyphon LiveKit transport configuration. The score engine is always active. This config only controls the LiveKit transport layer for multi-agent voice rooms.
func ConfigFromEnv ¶
func ConfigFromEnv() Config
ConfigFromEnv loads Polyphon configuration from environment variables.
func (Config) LiveKitConfigured ¶
LiveKitConfigured returns true when all LiveKit credentials are present. The transport layer (room tokens, voice rooms) is only available when LiveKit URL (must start with ws:// or wss://), API key, and API secret are all set.
type ContextBuilder ¶
type ContextBuilder struct {
// contains filtered or unexported fields
}
ContextBuilder constructs agent-scoped context views from session transcripts.
func NewContextBuilder ¶
func NewContextBuilder() *ContextBuilder
NewContextBuilder creates a context builder with defaults.
func (*ContextBuilder) BuildForAgent ¶
func (cb *ContextBuilder) BuildForAgent(session *PolyphonSession, agentId string) *CompactedContext
BuildForAgent constructs a compacted context for a specific agent.
func (*ContextBuilder) BuildForAgentSemantic ¶
func (cb *ContextBuilder) BuildForAgentSemantic( session *PolyphonSession, agentId string, utteranceEmbedding []float32, agentProfileEmbedding []float32, findRelevant func(combinedEmbedding []float32, limit int) ([]TranscriptEntry, error), ) *CompactedContext
BuildForAgentSemantic constructs a context where Zone 3 uses semantic similarity instead of speaker-ID filtering. Falls back to BuildForAgent when embeddings are unavailable.
The findRelevant function queries node_vectors for utterances similar to the combined agent profile + current utterance embedding.
func (*ContextBuilder) BuildForAgentWithBudget ¶
func (cb *ContextBuilder) BuildForAgentWithBudget(session *PolyphonSession, agentId string, budget TokenBudget) *CompactedContext
BuildForAgentWithBudget constructs a compacted context that fits within the given token budget. Uses 4-level compaction escalation.
func (*ContextBuilder) EntriesForCompaction ¶
func (cb *ContextBuilder) EntriesForCompaction(session *PolyphonSession) []TranscriptEntry
EntriesForCompaction returns transcript entries that have not been summarized yet and are outside the recent window.
func (*ContextBuilder) TriggerCompaction ¶
func (cb *ContextBuilder) TriggerCompaction( ctx context.Context, session *PolyphonSession, invokeAI func(ctx context.Context, templateId string, data map[string]any) (any, error), )
TriggerCompaction runs AI summarization on old transcript entries. Debounced: skips if compaction ran within the last 30 seconds. Non-blocking: designed to be called from a fire-and-forget goroutine.
func (*ContextBuilder) UpdateSummary ¶
func (cb *ContextBuilder) UpdateSummary(session *PolyphonSession, summary string, upToIndex int)
UpdateSummary stores a new compacted summary in the session.
type ConversationPhase ¶
type ConversationPhase string
ConversationPhase represents the current phase of a group conversation.
const (
PhaseGreeting ConversationPhase = "greeting"
PhaseExploration ConversationPhase = "exploration"
PhaseTask ConversationPhase = "task"
PhaseFollowUp ConversationPhase = "follow_up"
PhaseWindingDown ConversationPhase = "winding_down"
)
type ConversationStateMachine ¶
type ConversationStateMachine struct {
// contains filtered or unexported fields
}
ConversationStateMachine tracks the conversation phase and handles transitions.
func NewConversationStateMachine ¶
func NewConversationStateMachine() *ConversationStateMachine
NewConversationStateMachine creates a state machine starting in the greeting phase.
func (*ConversationStateMachine) LastTransition ¶
func (sm *ConversationStateMachine) LastTransition() time.Time
LastTransition returns when the last phase change occurred.
func (*ConversationStateMachine) Phase ¶
func (sm *ConversationStateMachine) Phase() ConversationPhase
Phase returns the current conversation phase.
func (*ConversationStateMachine) Transition ¶
func (sm *ConversationStateMachine) Transition(intent IntentType) ConversationPhase
Transition updates the conversation phase based on the detected intent. Returns the new phase after the transition.
type DefaultHeartbeatEvaluator ¶
type DefaultHeartbeatEvaluator struct {
// SilenceThreshold is how long silence must last before re-engagement.
// Default: 3 minutes.
SilenceThreshold time.Duration
// ClarifyingQuestionGrace is how long to wait after an agent asks a
// question before considering re-engagement (the human may be typing).
// Default: 30 seconds.
ClarifyingQuestionGrace time.Duration
}
DefaultHeartbeatEvaluator implements HeartbeatEvaluator with rule-based proactive behavior. It checks for silence, pending events, and predictive state to decide whether the score engine should take action.
func NewDefaultHeartbeatEvaluator ¶
func NewDefaultHeartbeatEvaluator() *DefaultHeartbeatEvaluator
NewDefaultHeartbeatEvaluator creates a heartbeat evaluator with sensible defaults.
func (*DefaultHeartbeatEvaluator) Evaluate ¶
func (e *DefaultHeartbeatEvaluator) Evaluate(session *PolyphonSession, candidates []AgentCandidate) *HeartbeatAction
Evaluate checks the session state and returns a HeartbeatAction. Returns nil if no action should be taken (idle).
type ExternalPredictiveAnalyzer ¶
type ExternalPredictiveAnalyzer struct {
// contains filtered or unexported fields
}
ExternalPredictiveAnalyzer calls an external HTTP endpoint for prediction. Activated when POLYPHON_PREDICTION_ENGINE_URL is set.
func NewExternalPredictiveAnalyzer ¶
func NewExternalPredictiveAnalyzer(endpoint string, fallback PredictiveAnalyzer) *ExternalPredictiveAnalyzer
NewExternalPredictiveAnalyzer creates an analyzer that calls an external service.
func (*ExternalPredictiveAnalyzer) Analyze ¶
func (a *ExternalPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
type HeartbeatAction ¶
type HeartbeatAction struct {
Type string `json:"type"` // "idle", "re-engage", "notify", "proactive"
AgentId string `json:"agentId"` // Which agent should act
AgentName string `json:"agentName"`
Reason string `json:"reason"` // Why this action was chosen
Payload map[string]any `json:"payload,omitempty"` // Action-specific data
}
HeartbeatAction represents a proactive action the score engine can take during a heartbeat tick. The heartbeat runs continuously per active space, evaluating whether the score engine should intervene.
type HeartbeatEvaluator ¶
type HeartbeatEvaluator interface {
Evaluate(session *PolyphonSession, candidates []AgentCandidate) *HeartbeatAction
}
HeartbeatEvaluator evaluates whether the score engine should take proactive action during a heartbeat tick. Implementations can be swapped for different strategies (e.g., rule-based, SI-powered, hybrid).
type IntentResult ¶
type IntentResult struct {
Primary IntentType `json:"primary"`
Secondary IntentType `json:"secondary,omitempty"`
Confidence float64 `json:"confidence"`
}
IntentResult holds the classification output.
func ClassifyIntent ¶
func ClassifyIntent(text string, mentions []Mention, prevEntry *TranscriptEntry) *IntentResult
ClassifyIntent determines the intent of an utterance using rule-based heuristics. Sub-10ms, no model call. Upgradeable to model-based later.
type IntentType ¶
type IntentType string
IntentType classifies the purpose of a human utterance.
const (
IntentGreeting IntentType = "greeting"
IntentFollowUp IntentType = "follow_up"
IntentDomainQuestion IntentType = "domain_question"
IntentCapabilityQuestion IntentType = "capability_question"
IntentTaskRequest IntentType = "task_request"
IntentDirectAddress IntentType = "direct_address"
IntentAffirmation IntentType = "affirmation"
IntentFarewell IntentType = "farewell"
)
type LocalRoomProvider ¶
type LocalRoomProvider struct {
// contains filtered or unexported fields
}
LocalRoomProvider generates LiveKit JWT tokens directly in the memQL process. This allows browser participants to join LiveKit rooms without the Bridge Agent running. Room/agent management operations are not supported -- they require the Bridge Agent.
func NewLocalRoomProvider ¶
func NewLocalRoomProvider(cfg Config) *LocalRoomProvider
NewLocalRoomProvider creates a provider that mints LiveKit access tokens using the credentials in cfg. The caller should verify cfg.LiveKitConfigured() before constructing this provider.
func (*LocalRoomProvider) CreateRoom ¶
func (p *LocalRoomProvider) CreateRoom(_ context.Context, _ string, _ []AgentRoomConfig) (*RoomInfo, error)
CreateRoom is not supported without the Bridge Agent.
func (*LocalRoomProvider) DestroyRoom ¶
func (p *LocalRoomProvider) DestroyRoom(_ context.Context, _ string) error
DestroyRoom is not supported without the Bridge Agent.
func (*LocalRoomProvider) GenerateToken ¶
func (p *LocalRoomProvider) GenerateToken(_ context.Context, spaceId, participantId, displayName string) (*RoomToken, error)
GenerateToken creates a LiveKit JWT for a participant to join a room. Room name follows the convention polyphon-{spaceId}.
func (*LocalRoomProvider) GetRoomInfo ¶
GetRoomInfo is not supported without the Bridge Agent.
type Mention ¶
type Mention struct {
ParticipantId string `json:"participantId"`
Name string `json:"name"`
ParticipantType string `json:"participantType"`
Position string `json:"position"` // "start" or "mid"
Role MentionRole `json:"role"`
}
Mention represents a detected @ mention in an utterance.
func ParseMentions ¶
func ParseMentions(text string, participants []ParticipantRef) ([]Mention, string)
ParseMentions scans utterance text for @{name} tokens and matches them against the participant roster. Returns detected mentions and cleaned text with @ prefixes removed for downstream scoring. Mentions carry ParticipantType so the router can distinguish "addressed an agent" (that agent responds) from "addressed a human" (AI stays silent).
type MentionRole ¶
type MentionRole string
MentionRole indicates how a participant was mentioned in an utterance.
const (
MentionRoleAddressee MentionRole = "addressee"
MentionRoleReference MentionRole = "reference"
)
type ModelPredictiveAnalyzer ¶
type ModelPredictiveAnalyzer struct {
// contains filtered or unexported fields
}
ModelPredictiveAnalyzer uses a fast SI model to predict conversation trajectory. Runs asynchronously via the prediction goroutine (every 30s per active space).
func NewModelPredictiveAnalyzer ¶
func NewModelPredictiveAnalyzer( invokeSI func(ctx context.Context, templateId string, data map[string]any) (any, error), embedFunc func(ctx context.Context, text string) ([]float32, error), ) *ModelPredictiveAnalyzer
NewModelPredictiveAnalyzer creates an analyzer that uses SI for prediction. The embedFunc parameter is optional (pass nil to disable vector-based phase detection).
func (*ModelPredictiveAnalyzer) Analyze ¶
func (a *ModelPredictiveAnalyzer) Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
type NoOpPredictiveAnalyzer ¶
type NoOpPredictiveAnalyzer struct{}
NoOpPredictiveAnalyzer is a placeholder that returns no prediction. Used when no SI engine is available (tests, standalone mode).
func (*NoOpPredictiveAnalyzer) Analyze ¶
func (n *NoOpPredictiveAnalyzer) Analyze(_ context.Context, _ *PolyphonSession, _ []AgentCandidate) (*PredictiveState, error)
type ParticipantRef ¶
ParticipantRef is a minimal mention-lookup view of a participant. Both agents and humans fit into it so ParseMentions can detect @-mentions that target either. The handler assembles a combined slice from agent candidates plus the human roster before calling.
type PhasePatternLibrary ¶
type PhasePatternLibrary struct {
// contains filtered or unexported fields
}
PhasePatternLibrary holds pre-embedded conversation phase patterns for fast detection.
func NewPhasePatternLibrary ¶
func NewPhasePatternLibrary() *PhasePatternLibrary
NewPhasePatternLibrary creates a pattern library. Call InitEmbeddings() after creation to pre-compute embeddings for all patterns.
func (*PhasePatternLibrary) DetectPhase ¶
func (lib *PhasePatternLibrary) DetectPhase(utteranceEmbedding []float32) (ConversationPhase, float64)
DetectPhase compares an utterance embedding against pattern clusters and returns the best-matching phase with a confidence score.
func (*PhasePatternLibrary) InitEmbeddings ¶
func (lib *PhasePatternLibrary) InitEmbeddings(ctx context.Context, embedFunc func(ctx context.Context, text string) ([]float32, error)) error
InitEmbeddings pre-computes embeddings for all patterns using the given embed function. Call this once at startup.
type PolyphonSession ¶
type PolyphonSession struct {
SpaceId string `json:"spaceId"`
Platform VoicePlatform `json:"platform"`
Humans []SessionParticipant `json:"humans"`
Agents []SessionAgent `json:"agents"`
Transcript []TranscriptEntry `json:"transcript"`
ConsecutiveAgentTurns int `json:"consecutiveAgentTurns"`
LastSpeakerId string `json:"lastSpeakerId,omitempty"`
LastSpeakerType string `json:"lastSpeakerType,omitempty"` // "human" or "agent"
LastAgentId string `json:"lastAgentId,omitempty"`
LastAddressedAgentId string `json:"lastAddressedAgentId,omitempty"` // Conversational thread: who was last addressed by name
LastAddressedAt time.Time `json:"lastAddressedAt,omitempty"` // When the thread was established
HandoffChainDepth int `json:"handoffChainDepth"` // Hops in the current handoff chain; reset on non-handoff outcomes.
AgentTurnCounts map[string]int `json:"agentTurnCounts"`
StateMachine *ConversationStateMachine `json:"-"`
Prediction *PredictiveState `json:"prediction,omitempty"`
PendingEvents []map[string]any `json:"pendingEvents,omitempty"` // Queue for notifications to surface
CompactedSummary string `json:"-"` // Zone 1: SI-generated summary of old messages
LastCompactionAt time.Time `json:"-"` // When compaction last ran
CompactionIndex int `json:"-"` // Transcript index up to which summary covers
CreatedAt time.Time `json:"createdAt"`
LastActivityAt time.Time `json:"lastActivityAt"`
// contains filtered or unexported fields
}
PolyphonSession tracks the state of a multi-agent conversation in a space.
func NewSession ¶
func NewSession(spaceId string) *PolyphonSession
NewSession creates a new PolyphonSession for the given space.
func (*PolyphonSession) AddTranscript ¶
func (s *PolyphonSession) AddTranscript(entry TranscriptEntry)
AddTranscript appends a transcript entry and updates session state.
func (*PolyphonSession) AgentTurnsSinceHuman ¶
func (s *PolyphonSession) AgentTurnsSinceHuman() int
AgentTurnsSinceHuman returns how many consecutive agent turns have occurred since the last human spoke.
func (*PolyphonSession) GetAddressedAgent ¶
func (s *PolyphonSession) GetAddressedAgent() (string, time.Time)
GetAddressedAgent returns the currently addressed agent ID and when it was set. Returns an empty string and the zero time if there is no active thread.
func (*PolyphonSession) GetHandoffChainDepth ¶
func (s *PolyphonSession) GetHandoffChainDepth() int
GetHandoffChainDepth returns the current handoff chain depth. 0 means the most recent turn was not a handoff (specialist answered or general-assistant fallback with no prior handoff); >0 means we are inside a chain of handoffs of that length.
func (*PolyphonSession) GetPrediction ¶
func (s *PolyphonSession) GetPrediction() *PredictiveState
GetPrediction returns the current predictive state, or nil if unavailable.
func (*PolyphonSession) LastAgentSpokeAt ¶
func (s *PolyphonSession) LastAgentSpokeAt(agentId string) time.Time
LastAgentSpokeAt returns the most recent time the given agent spoke, or zero time if the agent hasn't spoken.
func (*PolyphonSession) RecentTranscript ¶
func (s *PolyphonSession) RecentTranscript(n int) []TranscriptEntry
RecentTranscript returns the last n entries from the transcript.
func (*PolyphonSession) RecordHandoffOutcome ¶
func (s *PolyphonSession) RecordHandoffOutcome(handoff bool)
RecordHandoffOutcome updates chain depth based on the current turn's handoff flag: increments by 1 when handoff=true, resets to 0 when false. Called by the router after each LLM routing decision.
func (*PolyphonSession) SetAddressedAgent ¶
func (s *PolyphonSession) SetAddressedAgent(agentId string)
SetAddressedAgent updates the conversational thread to the given agent.
func (*PolyphonSession) SetPrediction ¶
func (s *PolyphonSession) SetPrediction(state *PredictiveState)
SetPrediction updates the session's predictive state.
type PredictiveAnalyzer ¶
type PredictiveAnalyzer interface {
// Analyze processes recent conversation state and returns a predictive
// snapshot. Called asynchronously -- must not block the main flow.
Analyze(ctx context.Context, session *PolyphonSession, candidates []AgentCandidate) (*PredictiveState, error)
}
PredictiveAnalyzer runs async analysis on the conversation to anticipate what might happen next. The score engine uses this to make better routing decisions and take proactive actions.
FUTURE: Initial implementation will use a cheap SI model call to classify conversation phase and anticipate topics. More sophisticated prediction (topic modeling, intent classification, conversation trajectory) can be layered on later.
type PredictiveState ¶
type PredictiveState struct {
// AnticipatedTopics are subjects the conversation is likely to explore next.
AnticipatedTopics []string `json:"anticipatedTopics,omitempty"`
// LikelyNextAgent is the agent most likely to be needed next.
LikelyNextAgent string `json:"likelyNextAgent,omitempty"`
// Confidence in the prediction (0.0-1.0).
Confidence float64 `json:"confidence,omitempty"`
// ConversationPhase classifies the current stage of the conversation.
// Values: "greeting", "exploration", "task", "decision", "winding-down"
ConversationPhase string `json:"conversationPhase,omitempty"`
// SuggestedAction is a proactive action the score engine could take.
SuggestedAction string `json:"suggestedAction,omitempty"`
// LastUpdated is when this prediction was last computed.
LastUpdated time.Time `json:"lastUpdated,omitempty"`
}
PredictiveState represents the score engine's anticipation of where the conversation is heading. Updated asynchronously by the PredictiveAnalyzer.
FUTURE: The actual prediction logic (SI-powered conversation phase classification, topic anticipation, intent forecasting) will be added in a follow-up implementation. This type and the PredictiveAnalyzer interface establish the extension points.
type RoomInfo ¶
type RoomInfo struct {
RoomName string `json:"roomName"`
RoomSID string `json:"roomSID"`
SpaceId string `json:"spaceId"`
Active bool `json:"active"`
Humans []RoomParticipant `json:"humans"`
Agents []RoomParticipant `json:"agents"`
}
RoomInfo describes the current state of an audio room.
type RoomParticipant ¶
type RoomParticipant struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"` // "human" or "agent"
IsSpeaking bool `json:"isSpeaking"`
}
RoomParticipant describes a participant in a room.
type RoomProvider ¶
type RoomProvider interface {
// CreateRoom creates a new audio room for a space.
CreateRoom(ctx context.Context, spaceId string, agents []AgentRoomConfig) (*RoomInfo, error)
// DestroyRoom tears down a room and all its participants.
DestroyRoom(ctx context.Context, spaceId string) error
// GenerateToken creates an access token for a participant to join a room.
GenerateToken(ctx context.Context, spaceId, participantId, displayName string) (*RoomToken, error)
// GetRoomInfo returns the current state of a room.
GetRoomInfo(ctx context.Context, spaceId string) (*RoomInfo, error)
}
RoomProvider abstracts the audio room management (LiveKit or equivalent). The Bridge Agent implements this interface to manage multi-party audio rooms.
type RoomToken ¶
type RoomToken struct {
Token string `json:"token"`
RoomName string `json:"roomName"`
LiveKitURL string `json:"livekitUrl"`
ExpiresAt int64 `json:"expiresAt"` // Unix timestamp
}
RoomToken is an access token for joining a room.
type ScoreDecision ¶
type ScoreDecision struct {
SpaceId string `json:"spaceId"`
UtteranceId string `json:"utteranceId"`
Winner *AgentScore `json:"winner,omitempty"`
Scores []AgentScore `json:"scores"`
Action string `json:"action"` // "respond", "continue", "silence"
Confidence string `json:"confidence"` // "high", "medium", "low" -- determines whether SI router is invoked
Continuation bool `json:"continuation"`
ConversationPhase ConversationPhase `json:"conversationPhase,omitempty"`
ResponseDelay time.Duration `json:"-"`
Timestamp time.Time `json:"timestamp"`
}
ScoreDecision is the output of the score engine for a given utterance.
func (*ScoreDecision) HasWinner ¶
func (d *ScoreDecision) HasWinner() bool
HasWinner returns true if a winning agent was selected.
type ScoreEngine ¶
type ScoreEngine struct {
// contains filtered or unexported fields
}
ScoreEngine is the Polyphon multi-agent scoring engine. It receives utterances, scores all agents, applies turn-taking policy, and returns a decision about which agent should respond.
func NewScoreEngine ¶
func NewScoreEngine(logger *slog.Logger) *ScoreEngine
NewScoreEngine creates a new ScoreEngine with default configuration.
func NewScoreEngineWithConfig ¶
func NewScoreEngineWithConfig(weights ScoringWeights, policyConfig TurnPolicyConfig, logger *slog.Logger) *ScoreEngine
NewScoreEngineWithConfig creates a ScoreEngine with custom scoring weights and turn policy.
func (*ScoreEngine) Policy ¶
func (c *ScoreEngine) Policy() *TurnPolicy
Policy returns the TurnPolicy for external inspection.
func (*ScoreEngine) ProcessUtterance ¶
func (c *ScoreEngine) ProcessUtterance(_ context.Context, utterance Utterance, candidates []AgentCandidate) *ScoreDecision
ProcessUtterance evaluates a human utterance and decides which agent (if any) should respond. This is the main entry point for the ScoreEngine.
func (*ScoreEngine) RecordAgentResponse ¶
func (c *ScoreEngine) RecordAgentResponse(spaceId string, agentId, agentName, responseText string, candidates []AgentCandidate) (bool, *AgentScore)
RecordAgentResponse records that an agent produced a response. Call this after the agent's response has been generated and is being delivered. Returns a continuation decision: should another agent follow up?
func (*ScoreEngine) Scorer ¶
func (c *ScoreEngine) Scorer() *Scorer
Scorer returns the Scorer for external inspection.
func (*ScoreEngine) Sessions ¶
func (c *ScoreEngine) Sessions() *SessionManager
Sessions returns the SessionManager for direct session operations.
type Scorer ¶
type Scorer struct {
Weights ScoringWeights
Policy TurnPolicyConfig
}
Scorer evaluates all agent candidates and returns scored results.
func NewScorer ¶
func NewScorer(weights ScoringWeights, policy TurnPolicyConfig) *Scorer
NewScorer creates a Scorer with the given weights and turn policy.
func (*Scorer) ScoreAll ¶
func (s *Scorer) ScoreAll(candidates []AgentCandidate, utterance Utterance, session *PolyphonSession) []AgentScore
ScoreAll evaluates every agent candidate against the utterance and session state. Returns a slice of AgentScore sorted by TotalScore descending.
type ScoringFactor ¶
type ScoringFactor struct {
Name string `json:"name"`
Weight float64 `json:"weight"`
Value float64 `json:"value"` // Raw value (0.0–1.0)
Score float64 `json:"score"` // Weight × Value
Detail string `json:"detail"` // Human-readable explanation
}
ScoringFactor represents a single factor contributing to an agent's score.
type ScoringWeights ¶
type ScoringWeights struct {
DirectAddress float64 `json:"directAddress"`
DomainRelevance float64 `json:"domainRelevance"`
ConversationalThread float64 `json:"conversationalThread"`
ConversationRecency float64 `json:"conversationRecency"`
QuestionDetection float64 `json:"questionDetection"`
ContinuationRelevance float64 `json:"continuationRelevance"`
}
ScoringWeights defines the weight (0–100) for each scoring factor. The sum of all weights should equal 100.
func DefaultScoringWeights ¶
func DefaultScoringWeights() ScoringWeights
DefaultScoringWeights returns the default scoring factor weights.
The cognitionRouting SI prompt is the primary turn-taking decision; the heuristic scorer produces signals the router reads (per-agent fit, intent, mentions) rather than itself picking the winner. That's why ConversationalThread is zeroed: the router already sees the previous responder and recent transcript and reasons about continuity directly; adding a silent +15 per turn caused the same agent to dominate once they got the first word in. Direct address and domain relevance still carry weight because they're observable signals, not preferences.
type SessionAgent ¶
type SessionAgent struct {
AgentId string `json:"agentId"`
ParticipantId string `json:"participantId"`
Name string `json:"name"`
JoinedAt time.Time `json:"joinedAt"`
}
SessionAgent represents an SI agent participant in a Polyphon session.
type SessionManager ¶
type SessionManager struct {
// contains filtered or unexported fields
}
SessionManager manages Polyphon sessions across spaces. Sessions are kept in memory (not persisted to database) because they represent ephemeral real-time voice state. If memQL restarts, active sessions are recreated when participants rejoin.
func NewSessionManager ¶
func NewSessionManager(logger *slog.Logger) *SessionManager
NewSessionManager creates a new SessionManager.
func (*SessionManager) ActiveSessions ¶
func (m *SessionManager) ActiveSessions() int
ActiveSessions returns the number of active Polyphon sessions.
func (*SessionManager) AddAgent ¶
func (m *SessionManager) AddAgent(spaceId string, agentId, participantId, name string) error
AddAgent registers an SI agent in the session.
func (*SessionManager) AddHuman ¶
func (m *SessionManager) AddHuman(spaceId string, participantId, displayName string) error
AddHuman registers a human participant in the session.
func (*SessionManager) Get ¶
func (m *SessionManager) Get(spaceId string) *PolyphonSession
Get returns the session for the given space, or nil if not found.
func (*SessionManager) GetOrCreate ¶
func (m *SessionManager) GetOrCreate(spaceId string) *PolyphonSession
GetOrCreate returns the session for the given space, creating one if needed.
func (*SessionManager) HasHeartbeat ¶
func (m *SessionManager) HasHeartbeat(spaceId string) bool
HasHeartbeat returns true if a heartbeat is running for the given space.
func (*SessionManager) HasHumans ¶
func (m *SessionManager) HasHumans(spaceId string) bool
HasHumans returns true if the session for the given space has at least one human.
func (*SessionManager) HasPrediction ¶
func (m *SessionManager) HasPrediction(spaceId string) bool
HasPrediction returns true if a prediction goroutine is running for the given space.
func (*SessionManager) RecordUtterance ¶
func (m *SessionManager) RecordUtterance(spaceId string, entry TranscriptEntry)
RecordUtterance adds a transcript entry to the session and trims old entries if the transcript exceeds the maximum length.
func (*SessionManager) Remove ¶
func (m *SessionManager) Remove(spaceId string) *PolyphonSession
Remove removes and returns the session for the given space. Also stops the heartbeat goroutine if running.
func (*SessionManager) RemoveAgent ¶
func (m *SessionManager) RemoveAgent(spaceId, agentId string)
RemoveAgent removes an SI agent from the session.
func (*SessionManager) RemoveHuman ¶
func (m *SessionManager) RemoveHuman(spaceId, participantId string)
RemoveHuman removes a human participant from the session.
func (*SessionManager) StartHeartbeat ¶
func (m *SessionManager) StartHeartbeat(spaceId string, interval time.Duration, evaluator HeartbeatEvaluator, candidates []AgentCandidate, actionHandler func(HeartbeatAction))
StartHeartbeat begins a background goroutine that ticks every interval and evaluates whether the score engine should take proactive action. The evaluator checks silence duration, pending events, and predictive state. The actionHandler callback is invoked when the evaluator decides to act.
Calling StartHeartbeat on a space that already has a heartbeat is a no-op. Call StopHeartbeat to stop the goroutine.
func (*SessionManager) StartPrediction ¶
func (m *SessionManager) StartPrediction(spaceId string, interval time.Duration, analyzer PredictiveAnalyzer, candidates []AgentCandidate)
StartPrediction launches a background goroutine that runs the predictive analyzer at the given interval for the specified space. Only runs when conversation is active.
func (*SessionManager) StopHeartbeat ¶
func (m *SessionManager) StopHeartbeat(spaceId string)
StopHeartbeat stops the heartbeat goroutine for the given space.
func (*SessionManager) StopPrediction ¶
func (m *SessionManager) StopPrediction(spaceId string)
StopPrediction stops the prediction goroutine for a space.
type SessionParticipant ¶
type SessionParticipant struct {
ParticipantId string `json:"participantId"`
DisplayName string `json:"displayName"`
JoinedAt time.Time `json:"joinedAt"`
}
SessionParticipant represents a human participant in a Polyphon session.
type TTSChunk ¶
type TTSChunk struct {
Audio []byte `json:"audio"` // PCM16 or WAV audio data
Sequence int `json:"sequence"` // Chunk sequence number
Done bool `json:"done"` // Is this the last chunk?
}
TTSChunk is a chunk of synthesized audio from the TTS provider.
type TTSConfig ¶
type TTSConfig struct {
// Text to synthesize.
Text string `json:"text"`
// VoiceModel is the voice identity to use (provider-specific).
// For OpenAI: "alloy", "nova", "coral", etc.
VoiceModel string `json:"voiceModel"`
// SampleRate in Hz for output audio (default: 16000).
SampleRate int `json:"sampleRate"`
// Speed controls speaking rate (1.0 = normal, 0.5 = slow, 2.0 = fast).
Speed float64 `json:"speed"`
// Language is a BCP-47 language code (e.g., "en-US").
Language string `json:"language"`
}
TTSConfig configures a TTS synthesis request.
func DefaultTTSConfig ¶
DefaultTTSConfig returns sensible defaults for TTS configuration.
type TTSProvider ¶
type TTSProvider interface {
// Synthesize converts text to speech audio, returning a reader of audio data.
// The audio format depends on the provider (typically PCM16 or WAV).
Synthesize(ctx context.Context, config TTSConfig) (io.ReadCloser, error)
// SynthesizeStream converts text to speech with streaming output.
// Audio chunks arrive on the returned stream's Chunks channel as they're generated.
SynthesizeStream(ctx context.Context, config TTSConfig) (TTSStream, error)
// AvailableVoices returns the list of voice models this provider supports.
AvailableVoices(ctx context.Context) ([]VoiceInfo, error)
}
TTSProvider abstracts text-to-speech synthesis. The implementation is OpenAI TTS in Stage 1; Stage 2 of the Deepgram migration adds Aura-2 as the new default.
type TTSStream ¶
type TTSStream interface {
// Chunks returns a channel of audio chunks as they're generated.
Chunks() <-chan TTSChunk
// Close terminates the stream.
Close() error
}
TTSStream represents a streaming TTS session.
type TokenBudget ¶
type TokenBudget struct {
ContextWindow int // Total context window (e.g., 128000)
MaxOutput int // Reserved for response (e.g., 4096)
SystemPrompt int // Estimated system prompt tokens (e.g., 2000)
SafetyMargin int // Buffer (e.g., 1000)
Available int // Computed: ContextWindow - MaxOutput - SystemPrompt - SafetyMargin
}
TokenBudget represents the available token space for conversation context.
func CalculateTokenBudget ¶
func CalculateTokenBudget(contextWindow, maxOutput, systemPromptEst int) TokenBudget
CalculateTokenBudget computes the available token budget for conversation history.
type TranscriptEntry ¶
type TranscriptEntry struct {
ID string `json:"id"`
SpaceId string `json:"spaceId"`
SpeakerId string `json:"speakerId"`
SpeakerName string `json:"speakerName"`
SpeakerType string `json:"speakerType"` // "human" or "agent"
Text string `json:"text"`
Timestamp time.Time `json:"timestamp"`
UtteranceId string `json:"utteranceId,omitempty"`
}
TranscriptEntry is a single entry in the shared conversation transcript.
type TurnPolicy ¶
type TurnPolicy struct {
Config TurnPolicyConfig
}
TurnPolicy enforces turn-taking rules for multi-agent conversations.
func NewTurnPolicy ¶
func NewTurnPolicy(cfg TurnPolicyConfig) *TurnPolicy
NewTurnPolicy creates a TurnPolicy with the given configuration.
func (*TurnPolicy) CanAgentRespond ¶
func (p *TurnPolicy) CanAgentRespond(agentId string, score float64, session *PolyphonSession) bool
CanAgentRespond checks if a specific agent is allowed to respond given the current session state and policy constraints.
func (*TurnPolicy) EvaluateScores ¶
func (p *TurnPolicy) EvaluateScores(scores []AgentScore, session *PolyphonSession) []AgentScore
EvaluateScores takes the scored agents and the session state, and marks which agents should respond. It applies the response threshold and handles the speakWhen="always" case.
func (*TurnPolicy) ResponseDelay ¶
func (p *TurnPolicy) ResponseDelay() time.Duration
ResponseDelay returns a randomized "thinking" pause duration to make the agent response feel more natural.
func (*TurnPolicy) ShouldContinue ¶
func (p *TurnPolicy) ShouldContinue(scores []AgentScore, session *PolyphonSession) (bool, *AgentScore)
ShouldContinue determines whether another agent should speak after the current agent just finished. It checks the continuation threshold and the consecutive agent turn limit.
type TurnPolicyConfig ¶
type TurnPolicyConfig struct {
// ResponseThreshold is the minimum score (0–100) for an agent to respond.
ResponseThreshold float64 `json:"responseThreshold"`
// ContinuationThreshold is the minimum score for a follow-up agent turn.
ContinuationThreshold float64 `json:"continuationThreshold"`
// MaxConsecutiveAgentTurns is the hard cap on agent turns before yielding to humans.
MaxConsecutiveAgentTurns int `json:"maxConsecutiveAgentTurns"`
// MinResponseDelayMs is the minimum "thinking" pause in milliseconds.
MinResponseDelayMs int `json:"minResponseDelayMs"`
// MaxResponseDelayMs is the maximum "thinking" pause in milliseconds.
MaxResponseDelayMs int `json:"maxResponseDelayMs"`
// RecencyDecayFactor controls how quickly recency score decreases (0.0–1.0).
// Higher values = faster decay = less likely to repeat the same agent.
RecencyDecayFactor float64 `json:"recencyDecayFactor"`
// TextThreadTimeout is how long a conversational thread stays active for text input.
TextThreadTimeout time.Duration `json:"-"`
// VoiceThreadTimeout is how long a conversational thread stays active for voice input.
VoiceThreadTimeout time.Duration `json:"-"`
}
TurnPolicyConfig holds configurable thresholds for turn-taking behavior.
func DefaultTurnPolicy ¶
func DefaultTurnPolicy() TurnPolicyConfig
DefaultTurnPolicy returns a TurnPolicyConfig with sensible defaults. ResponseThreshold of 30 ensures agents with strong domain relevance can respond without requiring direct addressing.
type Utterance ¶
type Utterance struct {
ID string `json:"id"`
SpaceId string `json:"spaceId"`
ParticipantId string `json:"participantId"`
SpeakerName string `json:"speakerName,omitempty"`
Text string `json:"text"`
IsFinal bool `json:"isFinal"`
IsVoice bool `json:"isVoice,omitempty"`
Timestamp time.Time `json:"timestamp"`
Mentions []Mention `json:"mentions,omitempty"`
Intent *IntentResult `json:"intent,omitempty"`
Source map[string]string `json:"source,omitempty"`
Embedding []float32 `json:"-"` // Pre-computed embedding for vector domain scoring
}
Utterance represents a transcribed speech segment from a human participant.
type VoiceInfo ¶
type VoiceInfo struct {
ID string `json:"id"`
Name string `json:"name"`
Language string `json:"language"`
Gender string `json:"gender"` // "male", "female", "neutral"
}
VoiceInfo describes an available TTS voice.
type VoicePlatform ¶
type VoicePlatform string
VoicePlatform identifies which voice provider flavor is active.
const (
// PlatformOpenAI uses OpenAI Realtime transcription (ASR) + OpenAI TTS API.
// Current default; replaced by Deepgram (Nova-3 + Aura-2) in Stage 2 of
// the Deepgram migration plan.
PlatformOpenAI VoicePlatform = "openai"
)