Documentation
¶
Index ¶
- func NormalizeLabel(val string) string
- func SerializeLabels(m map[string]string) string
- func TimeFormat(tt time.Time) string
- func TimeParse(s string) (time.Time, error)
- type CorpusMeta
- type CorpusReader
- type CorpusRecord
- type CorpusRepository
- type CorpusWriter
- type Cursor
- type DocReader
- type DocRepository
- type DocWriter
- type SchemaManager
- type SentenceIngest
- type TopicDeleter
- type TopicReader
- type TopicRepository
- type TopicWriter
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func NormalizeLabel ¶ added in v0.8.1
NormalizeLabel sanitizes a single label value for safe storage.
The corpus "labels" column uses comma-separated values (CSV). To prevent corruption, commas are stripped. Additionally, labels are lowercased and whitespace/hyphens are replaced with underscores for consistent matching and display.
This function must be called at every boundary where labels enter storage:
- CorpusStore.AddLabel (CLI path)
- SerializeLabels (epub ingest path)
func SerializeLabels ¶ added in v0.8.1
SerializeLabels converts a map of prefix → raw value into the comma-separated "prefix:normalized_value" format used by the corpus labels column. Empty values are skipped. The result is sorted alphabetically for deterministic output.
func TimeFormat ¶ added in v0.9.0
TimeFormat converts a time.Time to RFC3339 string in UTC. This should be used when sending time values to SQLite, since SQLite does not have a native datetime type. All timestamps in the database should use this format. Example: "2024-03-11T15:04:05Z"
Types ¶
type CorpusMeta ¶ added in v0.6.0
type CorpusMeta struct {
ID string // SHA-256 truncated hex of epub bytes
Epub string // epub file name (basename), used as source
Labels string // comma-separated DC labels
TxtHash string // SHA-256 hex of txt bytes
TxtCreatedAt time.Time
TxtEdit bool
TxtEditAt time.Time
TxtEditBy string
TxtEditNotes string
TxtAck bool
TxtAckAt time.Time
TxtAckBy string
NlpCreatedAt time.Time
NlpAck bool
NlpAckAt time.Time
NlpAckBy string
DeletedAt time.Time
CreatedAt time.Time
UpdatedAt time.Time
}
CorpusMeta holds all data collected for a single epub that will be inserted as one row in the corpus docs table, excluding heavy text/nlp fields.
func (CorpusMeta) HasAck ¶ added in v0.8.0
func (m CorpusMeta) HasAck() bool
func (CorpusMeta) HasNlp ¶ added in v0.8.0
func (m CorpusMeta) HasNlp() bool
HasNlp reports whether NLP content has been generated and stored. The Nlp field is not part of CorpusMeta — NlpCreatedAt is the authoritative signal for NLP presence.
func (CorpusMeta) HasTxt ¶ added in v0.8.0
func (m CorpusMeta) HasTxt() bool
HasTxt reports whether plain-text content has been generated and stored. The Txt field is not populated by most queries due to its size — TxtHash is the authoritative signal for text presence.
type CorpusReader ¶ added in v0.6.0
type CorpusReader interface {
// List returns records (metadata only).
List() ([]CorpusMeta, error)
// ReadMeta retrieves full metadata for a given document ID.
ReadMeta(id string) (CorpusMeta, error)
// ReadTxt retrieves the txt field for a given document ID as raw bytes.
ReadTxt(id string) ([]byte, error)
// ReadNlp retrieves the raw NLP JSON payload for a given document ID.
ReadNlp(id string) ([]byte, error)
// ListLabels returns all labels (unique names) found in the corpus.
// If labelSubStr is not empty, only labels whose name contains the substring are returned.
ListLabels(labelSubStr string) ([]string, error)
// Exists returns true if a record with the given ID is present in the docs table.
Exists(id string) (bool, error)
}
CorpusReader defines read operations for corpus storage
type CorpusRecord ¶ added in v0.6.0
type CorpusRecord struct {
CorpusMeta
Txt string // full plain text from pandoc
Nlp string // raw NLP JSON payload
}
CorpusRecord holds all data collected for a single epub that will be inserted as one row in the corpus docs table.
type CorpusRepository ¶ added in v0.6.0
type CorpusRepository interface {
CorpusReader
CorpusWriter
}
CorpusRepository combines read and write operations
type CorpusWriter ¶ added in v0.6.0
type CorpusWriter interface {
// WriteStream inserts corpus records yielded by the iterator.
WriteStream(seq func(yield func(CorpusRecord, error) bool)) error
// WriteNlp stores the NLP JSON payload for the given document ID.
WriteNlp(id string, nlp []byte) error
// ClearNlp sets the nlp field to NULL for the given document ID.
ClearNlp(id string) error
// UpdateTxt updates the txt field and its associated metadata for the given document ID.
UpdateTxt(id string, txt []byte, txtHash string, by string, notes string) error
// AckTxt updates the txt_ack fields for the given document ID.
AckTxt(id string, by string) error
// AckNlp updates the nlp_ack fields for the given document ID.
AckNlp(id string, by string) error
// AddLabel adds labels to a document in the corpus.
AddLabel(id string, labels ...string) error
// DeleteLabel deletes labels from a document in the corpus.
DeleteLabel(id string, labels ...string) error
// Delete removes a document from the corpus by its ID.
Delete(id string) error
}
CorpusWriter defines write operations for corpus storage
type DocReader ¶
type DocReader interface {
// List returns document identity metadata (Id, Source).
List() ([]sent.Meta, error)
// Nlp returns sentences for a document by ID. Labels are not loaded.
Nlp(id string) ([]sent.Sentence, error)
// FindCandidates returns sentence candidates matching ALL given lemmas
// AND ALL labelIDs. The caller uses ListLabels() to obtain IDs.
FindCandidates(lemmas []string, labelIDs []int, after Cursor, limit int, onCandidate func(sent.Sentence) error) (Cursor, error)
// ListLabels returns all labels (ID and Name). If labelSubStr is not empty,
// only labels whose name contains the substring are returned.
ListLabels(labelSubStr string) (sent.Labels, error)
// HasSentences returns true if at least one sentence exists for the given doc ID.
HasSentences(id string) (bool, error)
// HasLabelsOptimization returns true if at least one sentence_labels row exists for the given doc ID.
HasLabelsOptimization(id string) (bool, error)
// HasLemmaOptimization returns true if at least one sentence_lemmas row exists for the given doc ID.
HasLemmaOptimization(id string) (bool, error)
// Exists returns true if a document with the given ID is present in the docs table.
Exists(id string) (bool, error)
}
DocReader defines read operations for document storage
type DocRepository ¶
DocRepository combines read and write operations
type DocWriter ¶
type DocWriter interface {
// WriteMeta persists document metadata (id, source) and its labels.
WriteMeta(id string, source string, labels []string) ([]int, error)
// UpdateLabels upserts the given labels into the labels table, then updates
// the docs row for docID with the new comma-separated label_ids.
// Returns the resolved label IDs in the same order as labels.
// Note: This operation may leave orphaned entries in the labels table if they
// are no longer referenced by any document.
UpdateLabels(docID string, labels []string) ([]int, error)
// WriteNlpData persists sentences for the given docID.
WriteNlpData(docID string, sentences []SentenceIngest) error
// WriteLabelsOptimization writes sentence_labels rows for the given docID.
WriteLabelsOptimization(docID string, labelIDs []int) error
// WriteLemmaOptimization writes sentence_lemmas rows for the given docID.
WriteLemmaOptimization(docID string, sentences []SentenceIngest) error
// DeleteLemmaOptimization removes all sentence_lemmas rows for the given docID.
// This is the live switch: after this call the document disappears from FindCandidates.
DeleteLemmaOptimization(docID string) error
// DeleteLabelsOptimization removes all sentence_labels rows for the given docID.
DeleteLabelsOptimization(docID string) error
// DeleteNlpData removes all sentences rows for the given docID.
DeleteNlpData(docID string) error
// DeleteMeta removes the docs row for the given docID.
// Labels in the labels table are shared and are not removed.
DeleteMeta(docID string) error
}
DocWriter defines write operations for document storage
type SchemaManager ¶ added in v0.9.0
type SchemaManager interface {
// Create applies the necessary schema definitions to the database.
Create(schemaName string) error
}
SchemaManager defines operations for managing the database schema/lifecycle.
type SentenceIngest ¶ added in v0.6.0
type SentenceIngest struct {
ID int `json:"id"`
Lemmas []string `json:"lemmas"`
Tokens json.RawMessage `json:"tokens"` // Avoids unmarshaling tokens early
}
SentenceIngest represents the flat parsed structure ready for insertion.
type TopicDeleter ¶ added in v0.10.0
type TopicDeleter interface {
// Delete removes a topic from storage by name
Delete(name string) error
}
TopicDeleter defines delete operations for topic storage
type TopicReader ¶
type TopicReader interface {
// ReadAll returns all topics from storage
ReadAll() (topic.Library, error)
// Read returns a single topic by name
Read(name string) (topic.Topic, error)
}
TopicReader defines read operations for topic storage
type TopicRepository ¶
type TopicRepository interface {
TopicReader
TopicWriter
}
TopicRepository combines read and write operations
type TopicWriter ¶
TopicWriter defines write operations for topic storage