storage

package
v0.10.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 29, 2026 License: MIT Imports: 7 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NormalizeLabel added in v0.8.1

func NormalizeLabel(val string) string

NormalizeLabel sanitizes a single label value for safe storage.

The corpus "labels" column uses comma-separated values (CSV). To prevent corruption, commas are stripped. Additionally, labels are lowercased and whitespace/hyphens are replaced with underscores for consistent matching and display.

This function must be called at every boundary where labels enter storage:

  • CorpusStore.AddLabel (CLI path)
  • SerializeLabels (epub ingest path)

func SerializeLabels added in v0.8.1

func SerializeLabels(m map[string]string) string

SerializeLabels converts a map of prefix → raw value into the comma-separated "prefix:normalized_value" format used by the corpus labels column. Empty values are skipped. The result is sorted alphabetically for deterministic output.

func TimeFormat added in v0.9.0

func TimeFormat(tt time.Time) string

TimeFormat converts a time.Time to RFC3339 string in UTC. This should be used when sending time values to SQLite since it doesn't have a native datetime type. All timestamps in the database should use this format. Example: "2024-03-11T15:04:05Z"

func TimeParse added in v0.6.0

func TimeParse(s string) (time.Time, error)

TimeParse parses a RFC3339 string into a time.Time. This should be used when reading timestamps from SQLite to convert them back to time.Time values. Returns an error if the input string is not in RFC3339 format.

Types

type CorpusMeta added in v0.6.0

type CorpusMeta struct {
	ID           string // SHA-256 truncated hex of epub bytes
	Epub         string // epub file name (basename), used as source
	Labels       string // comma-separated DC labels
	TxtHash      string // SHA-256 hex of txt bytes
	TxtCreatedAt time.Time
	TxtEdit      bool
	TxtEditAt    time.Time
	TxtEditBy    string
	TxtEditNotes string
	TxtAck       bool
	TxtAckAt     time.Time
	TxtAckBy     string
	NlpCreatedAt time.Time
	NlpAck       bool
	NlpAckAt     time.Time
	NlpAckBy     string
	DeletedAt    time.Time
	CreatedAt    time.Time
	UpdatedAt    time.Time
}

CorpusMeta holds all data collected for a single epub that will be inserted as one row in the corpus docs table, excluding heavy text/nlp fields.

func (CorpusMeta) HasAck added in v0.8.0

func (m CorpusMeta) HasAck() bool

func (CorpusMeta) HasNlp added in v0.8.0

func (m CorpusMeta) HasNlp() bool

HasNlp reports whether Nlp content has been generated and stored. The Nlp field is not part of CorpusMeta (it lives only on CorpusRecord) — NlpCreatedAt is the authoritative signal for Nlp presence.

func (CorpusMeta) HasTxt added in v0.8.0

func (m CorpusMeta) HasTxt() bool

HasTxt reports whether plain-text content has been generated and stored. The Txt field is not populated by most queries due to its size — TxtHash is the authoritative signal for text presence.

type CorpusReader added in v0.6.0

type CorpusReader interface {
	// List returns records (metadata only).
	List() ([]CorpusMeta, error)

	// ReadMeta retrieves full metadata for a given document ID.
	ReadMeta(id string) (CorpusMeta, error)

	// ReadTxt retrieves the txt field for a given document ID as raw bytes.
	ReadTxt(id string) ([]byte, error)

	// ReadNlp retrieves the raw NLP JSON payload for a given document ID.
	ReadNlp(id string) ([]byte, error)

	// ListLabels returns all labels (unique names) found in the corpus.
	// If labelSubStr is not empty, only labels whose name contains the substring are returned.
	ListLabels(labelSubStr string) ([]string, error)

	// Exists returns true if a record with the given ID is present in the docs table.
	Exists(id string) (bool, error)
}

CorpusReader defines read operations for corpus storage

type CorpusRecord added in v0.6.0

type CorpusRecord struct {
	CorpusMeta
	Txt string // full plain text from pandoc
	Nlp string // raw NLP JSON payload
}

CorpusRecord holds all data collected for a single epub that will be inserted as one row in the corpus docs table.

type CorpusRepository added in v0.6.0

type CorpusRepository interface {
	CorpusReader
	CorpusWriter
}

CorpusRepository combines read and write operations

type CorpusWriter added in v0.6.0

type CorpusWriter interface {
	// WriteStream inserts corpus records yielded by the iterator.
	WriteStream(seq func(yield func(CorpusRecord, error) bool)) error

	// WriteNlp stores the NLP JSON payload for the given document ID.
	WriteNlp(id string, nlp []byte) error

	// ClearNlp sets the nlp field to NULL for the given document ID.
	ClearNlp(id string) error

	// UpdateTxt updates the txt field and its associated metadata for the given document ID.
	UpdateTxt(id string, txt []byte, txtHash string, by string, notes string) error

	// AckTxt updates the txt_ack fields for the given document ID.
	AckTxt(id string, by string) error

	// AckNlp updates the nlp_ack fields for the given document ID.
	AckNlp(id string, by string) error

	// AddLabel adds labels to a document in the corpus.
	AddLabel(id string, labels ...string) error

	// DeleteLabel deletes labels from a document in the corpus.
	DeleteLabel(id string, labels ...string) error

	// Delete removes a document from the corpus by its ID.
	Delete(id string) error
}

CorpusWriter defines write operations for corpus storage

type Cursor

type Cursor int64

Cursor for paginated lemma-based queries

type DocReader

type DocReader interface {
	// List returns document identity metadata (Id, Source).
	List() ([]sent.Meta, error)

	// Nlp returns sentences for a document by ID. Labels are not loaded.
	Nlp(id string) ([]sent.Sentence, error)

	// FindCandidates returns sentence candidates matching ALL given lemmas
	// AND ALL labelIDs. The caller uses ListLabels() to obtain IDs.
	FindCandidates(lemmas []string, labelIDs []int, after Cursor, limit int, onCandidate func(sent.Sentence) error) (Cursor, error)

	// ListLabels returns all labels (ID and Name). If labelSubStr is not empty,
	// only labels whose name contains the substring are returned.
	ListLabels(labelSubStr string) (sent.Labels, error)

	// HasSentences returns true if at least one sentence exists for the given doc ID.
	HasSentences(id string) (bool, error)

	// HasLabelsOptimization returns true if at least one sentence_labels row exists for the given doc ID.
	HasLabelsOptimization(id string) (bool, error)

	// HasLemmaOptimization returns true if at least one sentence_lemmas row exists for the given doc ID.
	HasLemmaOptimization(id string) (bool, error)

	// Exists returns true if a document with the given ID is present in the docs table.
	Exists(id string) (bool, error)
}

DocReader defines read operations for document storage

type DocRepository

type DocRepository interface {
	DocReader
	DocWriter
}

DocRepository combines read and write operations

type DocWriter

type DocWriter interface {
	// WriteMeta persists document metadata (id, source) and its labels.
	WriteMeta(id string, source string, labels []string) ([]int, error)

	// UpdateLabels upserts the given labels into the labels table, then updates
	// the docs row for docID with the new comma-separated label_ids.
	// Returns the resolved label IDs in the same order as labels.
	// Note: This operation may leave orphaned entries in the labels table if they
	// are no longer referenced by any document.
	UpdateLabels(docID string, labels []string) ([]int, error)

	// WriteNlpData persists sentences for the given docID.
	WriteNlpData(docID string, sentences []SentenceIngest) error

	// WriteLabelsOptimization writes sentence_labels rows for the given docID.
	WriteLabelsOptimization(docID string, labelIDs []int) error

	// WriteLemmaOptimization writes sentence_lemmas rows for the given docID.
	WriteLemmaOptimization(docID string, sentences []SentenceIngest) error

	// DeleteLemmaOptimization removes all sentence_lemmas rows for the given docID.
	// This is the live switch: after this call the document disappears from FindCandidates.
	DeleteLemmaOptimization(docID string) error

	// DeleteLabelsOptimization removes all sentence_labels rows for the given docID.
	DeleteLabelsOptimization(docID string) error

	// DeleteNlpData removes all sentences rows for the given docID.
	DeleteNlpData(docID string) error

	// DeleteMeta removes the docs row for the given docID.
	// Labels in the labels table are shared and are not removed.
	DeleteMeta(docID string) error
}

DocWriter defines write operations for document storage

type SchemaManager added in v0.9.0

type SchemaManager interface {
	// Create applies the necessary schema definitions to the database.
	Create(schemaName string) error
}

SchemaManager defines operations for managing the database schema/lifecycle.

type SentenceIngest added in v0.6.0

type SentenceIngest struct {
	ID     int             `json:"id"`
	Lemmas []string        `json:"lemmas"`
	Tokens json.RawMessage `json:"tokens"` // Avoids unmarshaling tokens early
}

SentenceIngest represents the flat parsed structure ready for insertion.

type TopicDeleter added in v0.10.0

type TopicDeleter interface {
	// Delete removes a topic from storage by name
	Delete(name string) error
}

TopicDeleter defines delete operations for topic storage

type TopicReader

type TopicReader interface {
	// ReadAll returns all topics from storage
	ReadAll() (topic.Library, error)

	// Read returns a single topic by name
	Read(name string) (topic.Topic, error)
}

TopicReader defines read operations for topic storage

type TopicRepository

type TopicRepository interface {
	TopicReader
	TopicWriter
}

TopicRepository combines read and write operations

type TopicWriter

type TopicWriter interface {
	// Write persists a topic to storage
	Write(tp topic.Topic) error
}

TopicWriter defines write operations for topic storage

Directories

Path Synopsis
sqlite

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL