Documentation
¶
Index ¶
- Variables
- func GetApostrophe(input string) string
- func GetDefaultLookupMap() map[string][]string
- func LoadLookupMap(filePath string) (map[string][]string, error)
- func LoadWordList(filePath string) ([]string, error)
- func NormalizeForLM(s string) string
- type AtomicCounter
- type Candidate
- type Candidates
- type CaseType
- type CharMatcher
- type CharacterGraph
- type CharacterGraphDecoder
- type DiacriticsIgnoringMatcher
- type Hypothesis
- type Node
- func (n *Node) AddChild(index int, c rune, nodeType NodeType, word string) *Node
- func (n *Node) ConnectEpsilon(node *Node) bool
- func (n *Node) Debug() string
- func (n *Node) Equals(other *Node) bool
- func (n *Node) GetAllChildNodes() []*Node
- func (n *Node) GetChildList(c rune) []*Node
- func (n *Node) GetChildListMulti(chars []rune) []*Node
- func (n *Node) GetImmediateChild(c rune) *Node
- func (n *Node) GetImmediateChildNodeIterable() []*Node
- func (n *Node) GetImmediateChildNodes() []*Node
- func (n *Node) HasChild(c rune) bool
- func (n *Node) HasEpsilonConnection() bool
- func (n *Node) HasImmediateChild(c rune) bool
- func (n *Node) Hash() int
- func (n *Node) String() string
- type NodeType
- type Operation
- type StemEndingGraph
- type TurkishSentenceNormalizer
- type TurkishSentenceNormalizerAdvanced
- type TurkishSentenceNormalizerEnhanced
- type TurkishSentenceNormalizerWithLexicon
- type TurkishSpellChecker
- func (tsc *TurkishSpellChecker) Check(word string) bool
- func (tsc *TurkishSpellChecker) RankByFrequency(suggestions []string, frequencies map[string]int) []string
- func (tsc *TurkishSpellChecker) SuggestForWord(word string) []string
- func (tsc *TurkishSpellChecker) SuggestForWordForNormalization(word string, leftContext string, rightContext string) []string
- func (tsc *TurkishSpellChecker) SuggestForWordWithContext(word string, previous string, next string) []string
Constants ¶
This section is empty.
Variables ¶
var DiacriticsIgnoringMatcherInstance = NewDiacriticsIgnoringMatcher()
DiacriticsIgnoringMatcherInstance is the singleton instance
Functions ¶
func GetApostrophe ¶
GetApostrophe returns the apostrophe character used in the word
func GetDefaultLookupMap ¶
GetDefaultLookupMap returns the embedded default lookup map for common normalizations
func LoadLookupMap ¶
LoadLookupMap loads a normalization lookup map from a file. Format examples:
tmm = tamam
iyi=ıyı,iyi
ole=oley,öyle,öle
func LoadWordList ¶
LoadWordList loads a simple word list (one word per line)
func NormalizeForLM ¶
NormalizeForLM normalizes a word for the language model
Types ¶
type AtomicCounter ¶
type AtomicCounter struct {
// contains filtered or unexported fields
}
AtomicCounter provides thread-safe counter
func NewAtomicCounter ¶
func NewAtomicCounter() *AtomicCounter
NewAtomicCounter creates a new atomic counter
func (*AtomicCounter) GetAndIncrement ¶
func (ac *AtomicCounter) GetAndIncrement() int
GetAndIncrement atomically gets the current value and then increments it
type Candidate ¶
Candidate represents a normalization candidate
func GetEndCandidate ¶
func GetEndCandidate() *Candidate
GetEndCandidate returns the END sentinel candidate
func GetStartCandidate ¶
func GetStartCandidate() *Candidate
GetStartCandidate returns the START sentinel candidate
func NewCandidate ¶
NewCandidate creates a new candidate
type Candidates ¶
Candidates represents multiple candidates for a word
func GetEndCandidates ¶
func GetEndCandidates() *Candidates
GetEndCandidates returns the END candidates structure
func NewCandidates ¶
func NewCandidates(word string, candidates []*Candidate) *Candidates
NewCandidates creates a new candidates structure
type CharMatcher ¶
CharMatcher interface for character matching strategies
type CharacterGraph ¶
type CharacterGraph struct {
Root *Node
// contains filtered or unexported fields
}
CharacterGraph represents a graph structure for character-based operations
func NewCharacterGraph ¶
func NewCharacterGraph() *CharacterGraph
NewCharacterGraph creates a new character graph
func (*CharacterGraph) AddWord ¶
func (cg *CharacterGraph) AddWord(word string, nodeType NodeType) *Node
AddWord adds a word to the graph with given type
func (*CharacterGraph) ContainsWord ¶
func (cg *CharacterGraph) ContainsWord(word string) bool
ContainsWord checks if graph contains a word
func (*CharacterGraph) GetAllNodes ¶
func (cg *CharacterGraph) GetAllNodes() []*Node
GetAllNodes returns all nodes in the graph that have words
func (*CharacterGraph) GetNode ¶
func (cg *CharacterGraph) GetNode(word string) *Node
GetNode returns the node corresponding to the given word, or nil if not found
func (*CharacterGraph) GetNodeCount ¶
func (cg *CharacterGraph) GetNodeCount() int
GetNodeCount returns total number of nodes in graph
type CharacterGraphDecoder ¶
type CharacterGraphDecoder struct {
Graph *CharacterGraph
MaxPenalty float64
CheckNearKeySubstitution bool
}
CharacterGraphDecoder decodes strings using character graph with error tolerance
func NewCharacterGraphDecoder ¶
func NewCharacterGraphDecoder(graph *CharacterGraph) *CharacterGraphDecoder
NewCharacterGraphDecoder creates a new decoder
func (*CharacterGraphDecoder) GetSuggestions ¶
func (cgd *CharacterGraphDecoder) GetSuggestions(input string, matcher CharMatcher) []string
GetSuggestions returns suggestions for input string using given matcher
type DiacriticsIgnoringMatcher ¶
type DiacriticsIgnoringMatcher struct {
// contains filtered or unexported fields
}
DiacriticsIgnoringMatcher matches characters ignoring diacritics
func NewDiacriticsIgnoringMatcher ¶
func NewDiacriticsIgnoringMatcher() *DiacriticsIgnoringMatcher
NewDiacriticsIgnoringMatcher creates a new diacritics ignoring matcher
func (*DiacriticsIgnoringMatcher) Matches ¶
func (dim *DiacriticsIgnoringMatcher) Matches(c rune) []rune
Matches returns possible character matches for given character
type Hypothesis ¶
type Hypothesis struct {
History []*Candidate
Current *Candidate
Previous *Hypothesis
Score float32
}
Hypothesis represents a normalization hypothesis in beam search
func GetBestHypothesis ¶
func GetBestHypothesis(hypotheses []*Hypothesis) *Hypothesis
GetBestHypothesis returns best hypothesis from list
func (*Hypothesis) Equals ¶
func (h *Hypothesis) Equals(other *Hypothesis) bool
Equals checks if two hypotheses are equal
type Node ¶
type Node struct {
Index int
Char rune
Type NodeType
Word string
EpsilonNodes []*Node
Nodes map[rune]*Node
}
Node represents a node in the character graph
func (*Node) ConnectEpsilon ¶
ConnectEpsilon connects this node to another node via epsilon transition
func (*Node) GetAllChildNodes ¶
GetAllChildNodes returns all child nodes including epsilon-connected nodes
func (*Node) GetChildList ¶
GetChildList returns list of children matching given character
func (*Node) GetChildListMulti ¶
GetChildListMulti returns list of children matching any of given characters
func (*Node) GetImmediateChild ¶
GetImmediateChild returns immediate child node for given character
func (*Node) GetImmediateChildNodeIterable ¶
GetImmediateChildNodeIterable returns all immediate child nodes (same as GetImmediateChildNodes)
func (*Node) GetImmediateChildNodes ¶
GetImmediateChildNodes returns all immediate child nodes
func (*Node) HasChild ¶
HasChild checks if node has a child with given character (including epsilon nodes)
func (*Node) HasEpsilonConnection ¶
HasEpsilonConnection checks if node has epsilon connections
func (*Node) HasImmediateChild ¶
HasImmediateChild checks if node has an immediate child with given character
type StemEndingGraph ¶
type StemEndingGraph struct {
EndingGraph *CharacterGraph
StemGraph *CharacterGraph
// contains filtered or unexported fields
}
StemEndingGraph creates a character graph from stems and endings
func NewStemEndingGraph ¶
func NewStemEndingGraph(stemWords []string, endingsPath string) (*StemEndingGraph, error)
NewStemEndingGraph creates a new stem-ending graph
func NewStemEndingGraphFromMorphology ¶
func NewStemEndingGraphFromMorphology(morph *morphology.TurkishMorphology, endingsPath string) (*StemEndingGraph, error)
NewStemEndingGraphFromMorphology creates a stem-ending graph from TurkishMorphology. This matches Java's implementation, which extracts stems from the morphology.
func (*StemEndingGraph) GetEndingGraph ¶
func (seg *StemEndingGraph) GetEndingGraph() *CharacterGraph
GetEndingGraph returns the ending graph
func (*StemEndingGraph) GetStemGraph ¶
func (seg *StemEndingGraph) GetStemGraph() *CharacterGraph
GetStemGraph returns the stem graph
type TurkishSentenceNormalizer ¶
type TurkishSentenceNormalizer struct {
SpellChecker *TurkishSpellChecker
Replacements map[string]string
NoSplitWords map[string]bool
CommonSplits map[string]string
CommonConnectedSuffixes map[string]bool
LookupManual map[string][]string
LookupFromGraph map[string][]string
LookupFromASCII map[string][]string
AlwaysApplyDeasciifier bool
// contains filtered or unexported fields
}
TurkishSentenceNormalizer normalizes informal Turkish sentences
func NewTurkishSentenceNormalizer ¶
func NewTurkishSentenceNormalizer(stemWords []string, resourcesPath string) (*TurkishSentenceNormalizer, error)
NewTurkishSentenceNormalizer creates a new sentence normalizer
func (*TurkishSentenceNormalizer) Normalize ¶
func (tsn *TurkishSentenceNormalizer) Normalize(sentence string) string
Normalize normalizes a sentence
func (*TurkishSentenceNormalizer) NormalizeWithBeamSearch ¶
func (tsn *TurkishSentenceNormalizer) NormalizeWithBeamSearch(sentence string) string
NormalizeWithBeamSearch normalizes sentence using beam search (simplified without LM)
type TurkishSentenceNormalizerAdvanced ¶
type TurkishSentenceNormalizerAdvanced struct {
SpellChecker *TurkishSpellChecker
Morphology *morphology.TurkishMorphology
InformalMorphology *morphology.TurkishMorphology
AnalysisConverter *analysis.InformalAnalysisConverter
LanguageModel lm.LanguageModel
Replacements map[string]string
NoSplitWords map[string]bool
CommonSplits map[string]string
CommonConnectedSuffixes map[string]bool
LookupManual map[string][]string
LookupFromGraph map[string][]string
LookupFromASCII map[string][]string
AlwaysApplyDeasciifier bool
}
TurkishSentenceNormalizerAdvanced normalizes informal Turkish sentences with full morphology support
func NewTurkishSentenceNormalizerAdvanced ¶
func NewTurkishSentenceNormalizerAdvanced(morph *morphology.TurkishMorphology, dataRoot string) (*TurkishSentenceNormalizerAdvanced, error)
NewTurkishSentenceNormalizerAdvanced creates a new advanced sentence normalizer with morphology
func (*TurkishSentenceNormalizerAdvanced) Normalize ¶
func (tsn *TurkishSentenceNormalizerAdvanced) Normalize(sentence string) string
Normalize normalizes a sentence using full morphological analysis and beam search
type TurkishSentenceNormalizerEnhanced ¶
type TurkishSentenceNormalizerEnhanced struct {
LookupManual map[string][]string
WordDictionary map[string]bool
SpellChecker *CharacterGraphDecoder
Graph *CharacterGraph
}
TurkishSentenceNormalizerEnhanced is an improved normalizer with lookup tables and dictionary support
func NewTurkishSentenceNormalizerEnhanced ¶
func NewTurkishSentenceNormalizerEnhanced() (*TurkishSentenceNormalizerEnhanced, error)
NewTurkishSentenceNormalizerEnhanced creates a new enhanced normalizer
func (*TurkishSentenceNormalizerEnhanced) Normalize ¶
func (tsne *TurkishSentenceNormalizerEnhanced) Normalize(sentence string) string
Normalize normalizes a Turkish sentence
type TurkishSentenceNormalizerWithLexicon ¶
type TurkishSentenceNormalizerWithLexicon struct {
LookupManual map[string][]string
Lexicon *lexicon.RootLexicon
WordDictionary map[string]bool
SpellChecker *CharacterGraphDecoder
Graph *CharacterGraph
}
TurkishSentenceNormalizerWithLexicon uses full lexicon for normalization
func NewTurkishSentenceNormalizerWithLexicon ¶
func NewTurkishSentenceNormalizerWithLexicon() (*TurkishSentenceNormalizerWithLexicon, error)
NewTurkishSentenceNormalizerWithLexicon creates normalizer with full lexicon
func (*TurkishSentenceNormalizerWithLexicon) Normalize ¶
func (tsnl *TurkishSentenceNormalizerWithLexicon) Normalize(sentence string) string
Normalize normalizes a Turkish sentence
type TurkishSpellChecker ¶
type TurkishSpellChecker struct {
Decoder *CharacterGraphDecoder
CharMatcher CharMatcher
Morphology interface{} // *morphology.TurkishMorphology
LanguageModel lm.LanguageModel
// contains filtered or unexported fields
}
TurkishSpellChecker provides spell checking and suggestion functionality
func NewTurkishSpellChecker ¶
func NewTurkishSpellChecker(stemWords []string, endingsPath string, matcher CharMatcher) (*TurkishSpellChecker, error)
NewTurkishSpellChecker creates a new spell checker
func (*TurkishSpellChecker) Check ¶
func (tsc *TurkishSpellChecker) Check(word string) bool
Check checks if word is spelled correctly
func (*TurkishSpellChecker) RankByFrequency ¶
func (tsc *TurkishSpellChecker) RankByFrequency(suggestions []string, frequencies map[string]int) []string
RankByFrequency ranks suggestions by frequency (requires frequency map)
func (*TurkishSpellChecker) SuggestForWord ¶
func (tsc *TurkishSpellChecker) SuggestForWord(word string) []string
SuggestForWord returns suggestions for a misspelled word
func (*TurkishSpellChecker) SuggestForWordForNormalization ¶
func (tsc *TurkishSpellChecker) SuggestForWordForNormalization(word string, leftContext string, rightContext string) []string
SuggestForWordForNormalization returns suggestions for normalization (alias)
func (*TurkishSpellChecker) SuggestForWordWithContext ¶
func (tsc *TurkishSpellChecker) SuggestForWordWithContext(word string, previous string, next string) []string
SuggestForWordWithContext returns suggestions with context awareness