normalization

package

Version: v0.1.4 (latest) | Published: Nov 9, 2025 | License: Apache-2.0 | Imports: 19 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/kalaomer/zemberek-go

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
func GetApostrophe(input string) string
func GetDefaultLookupMap() map[string][]string
func LoadLookupMap(filePath string) (map[string][]string, error)
func LoadWordList(filePath string) ([]string, error)
func NormalizeForLM(s string) string
type AtomicCounter
- func NewAtomicCounter() *AtomicCounter
- func (ac *AtomicCounter) GetAndIncrement() int
type Candidate
- func GetEndCandidate() *Candidate
- func GetStartCandidate() *Candidate
- func NewCandidate(content string) *Candidate
type Candidates
- func GetEndCandidates() *Candidates
- func NewCandidates(word string, candidates []*Candidate) *Candidates
type CaseType
type CharMatcher
type CharacterGraph
- func NewCharacterGraph() *CharacterGraph
- func (cg *CharacterGraph) AddWord(word string, nodeType NodeType) *Node
- func (cg *CharacterGraph) ContainsWord(word string) bool
- func (cg *CharacterGraph) GetAllNodes() []*Node
- func (cg *CharacterGraph) GetNode(word string) *Node
- func (cg *CharacterGraph) GetNodeCount() int
type CharacterGraphDecoder
- func NewCharacterGraphDecoder(graph *CharacterGraph) *CharacterGraphDecoder
- func (cgd *CharacterGraphDecoder) GetSuggestions(input string, matcher CharMatcher) []string
type DiacriticsIgnoringMatcher
- func NewDiacriticsIgnoringMatcher() *DiacriticsIgnoringMatcher
- func (dim *DiacriticsIgnoringMatcher) Matches(c rune) []rune
type Hypothesis
- func GetBestHypothesis(hypotheses []*Hypothesis) *Hypothesis
- func NewHypothesis() *Hypothesis
- func (h *Hypothesis) Equals(other *Hypothesis) bool
- func (h *Hypothesis) Hash() int
type Node
- func NewNode(index int, char rune, nodeType NodeType, word string) *Node
- func (n *Node) AddChild(index int, c rune, nodeType NodeType, word string) *Node
- func (n *Node) ConnectEpsilon(node *Node) bool
- func (n *Node) Debug() string
- func (n *Node) Equals(other *Node) bool
- func (n *Node) GetAllChildNodes() []*Node
- func (n *Node) GetChildList(c rune) []*Node
- func (n *Node) GetChildListMulti(chars []rune) []*Node
- func (n *Node) GetImmediateChild(c rune) *Node
- func (n *Node) GetImmediateChildNodeIterable() []*Node
- func (n *Node) GetImmediateChildNodes() []*Node
- func (n *Node) HasChild(c rune) bool
- func (n *Node) HasEpsilonConnection() bool
- func (n *Node) HasImmediateChild(c rune) bool
- func (n *Node) Hash() int
- func (n *Node) String() string
type NodeType
type Operation
type StemEndingGraph
- func NewStemEndingGraph(stemWords []string, endingsPath string) (*StemEndingGraph, error)
- func NewStemEndingGraphFromMorphology(morph *morphology.TurkishMorphology, endingsPath string) (*StemEndingGraph, error)
- func (seg *StemEndingGraph) GetEndingGraph() *CharacterGraph
- func (seg *StemEndingGraph) GetStemGraph() *CharacterGraph
type TurkishSentenceNormalizer
- func NewTurkishSentenceNormalizer(stemWords []string, resourcesPath string) (*TurkishSentenceNormalizer, error)
- func (tsn *TurkishSentenceNormalizer) Normalize(sentence string) string
- func (tsn *TurkishSentenceNormalizer) NormalizeWithBeamSearch(sentence string) string
type TurkishSentenceNormalizerAdvanced
- func NewTurkishSentenceNormalizerAdvanced(morph *morphology.TurkishMorphology, dataRoot string) (*TurkishSentenceNormalizerAdvanced, error)
- func (tsn *TurkishSentenceNormalizerAdvanced) Normalize(sentence string) string
type TurkishSentenceNormalizerEnhanced
- func NewTurkishSentenceNormalizerEnhanced() (*TurkishSentenceNormalizerEnhanced, error)
- func (tsne *TurkishSentenceNormalizerEnhanced) Normalize(sentence string) string
type TurkishSentenceNormalizerWithLexicon
- func NewTurkishSentenceNormalizerWithLexicon() (*TurkishSentenceNormalizerWithLexicon, error)
- func (tsnl *TurkishSentenceNormalizerWithLexicon) Normalize(sentence string) string
type TurkishSpellChecker
- func NewTurkishSpellChecker(stemWords []string, endingsPath string, matcher CharMatcher) (*TurkishSpellChecker, error)
- func (tsc *TurkishSpellChecker) Check(word string) bool
- func (tsc *TurkishSpellChecker) RankByFrequency(suggestions []string, frequencies map[string]int) []string
- func (tsc *TurkishSpellChecker) SuggestForWord(word string) []string
- func (tsc *TurkishSpellChecker) SuggestForWordForNormalization(word string, leftContext string, rightContext string) []string
- func (tsc *TurkishSpellChecker) SuggestForWordWithContext(word string, previous string, next string) []string

Constants ¶

This section is empty.

Variables ¶

View Source

var DiacriticsIgnoringMatcherInstance = NewDiacriticsIgnoringMatcher()

DiacriticsIgnoringMatcherInstance is the singleton instance

Functions ¶

func GetApostrophe ¶

func GetApostrophe(input string) string

GetApostrophe returns the apostrophe character used in the word

func GetDefaultLookupMap ¶

func GetDefaultLookupMap() map[string][]string

GetDefaultLookupMap returns the embedded default lookup map for common normalizations

func LoadLookupMap ¶

func LoadLookupMap(filePath string) (map[string][]string, error)

LoadLookupMap loads a normalization lookup map from a file. Format examples:

tmm = tamam
iyi=ıyı,iyi
ole=oley,öyle,öle

func LoadWordList ¶

func LoadWordList(filePath string) ([]string, error)

LoadWordList loads a simple word list (one word per line)

func NormalizeForLM ¶

func NormalizeForLM(s string) string

NormalizeForLM normalizes a word for the language model

Types ¶

type AtomicCounter ¶

type AtomicCounter struct {
	// contains filtered or unexported fields
}

AtomicCounter provides a thread-safe counter

func NewAtomicCounter ¶

func NewAtomicCounter() *AtomicCounter

NewAtomicCounter creates a new atomic counter

func (*AtomicCounter) GetAndIncrement ¶

func (ac *AtomicCounter) GetAndIncrement() int

GetAndIncrement atomically gets the current value and increments it

type Candidate ¶

type Candidate struct {
	Content string
	Score   float32
}

Candidate represents a normalization candidate

func GetEndCandidate ¶

func GetEndCandidate() *Candidate

GetEndCandidate returns the END sentinel candidate

func GetStartCandidate ¶

func GetStartCandidate() *Candidate

GetStartCandidate returns the START sentinel candidate

func NewCandidate ¶

func NewCandidate(content string) *Candidate

NewCandidate creates a new candidate

type Candidates ¶

type Candidates struct {
	Word       string
	Candidates []*Candidate
}

Candidates represents multiple candidates for a word

func GetEndCandidates ¶

func GetEndCandidates() *Candidates

GetEndCandidates returns the END candidates structure

func NewCandidates ¶

func NewCandidates(word string, candidates []*Candidate) *Candidates

NewCandidates creates a new candidates structure

type CaseType ¶

type CaseType int

CaseType represents text case

const (
	DefaultCase CaseType = iota
	LowerCase
	UpperCase
	TitleCase
	MixedCase
)

type CharMatcher ¶

type CharMatcher interface {
	Matches(c rune) []rune
}

CharMatcher interface for character matching strategies

type CharacterGraph ¶

type CharacterGraph struct {
	Root *Node
	// contains filtered or unexported fields
}

CharacterGraph represents a graph structure for character-based operations

func NewCharacterGraph ¶

func NewCharacterGraph() *CharacterGraph

NewCharacterGraph creates a new character graph

func (*CharacterGraph) AddWord ¶

func (cg *CharacterGraph) AddWord(word string, nodeType NodeType) *Node

AddWord adds a word to the graph with given type

func (*CharacterGraph) ContainsWord ¶

func (cg *CharacterGraph) ContainsWord(word string) bool

ContainsWord checks if graph contains a word

func (*CharacterGraph) GetAllNodes ¶

func (cg *CharacterGraph) GetAllNodes() []*Node

GetAllNodes returns all nodes in the graph that have words

func (*CharacterGraph) GetNode ¶

func (cg *CharacterGraph) GetNode(word string) *Node

GetNode returns the node corresponding to the given word, or nil if not found

func (*CharacterGraph) GetNodeCount ¶

func (cg *CharacterGraph) GetNodeCount() int

GetNodeCount returns total number of nodes in graph

type CharacterGraphDecoder ¶

type CharacterGraphDecoder struct {
	Graph                    *CharacterGraph
	MaxPenalty               float64
	CheckNearKeySubstitution bool
}

CharacterGraphDecoder decodes strings using character graph with error tolerance

func NewCharacterGraphDecoder ¶

func NewCharacterGraphDecoder(graph *CharacterGraph) *CharacterGraphDecoder

NewCharacterGraphDecoder creates a new decoder

func (*CharacterGraphDecoder) GetSuggestions ¶

func (cgd *CharacterGraphDecoder) GetSuggestions(input string, matcher CharMatcher) []string

GetSuggestions returns suggestions for input string using given matcher

type DiacriticsIgnoringMatcher ¶

type DiacriticsIgnoringMatcher struct {
	// contains filtered or unexported fields
}

DiacriticsIgnoringMatcher matches characters ignoring diacritics

func NewDiacriticsIgnoringMatcher ¶

func NewDiacriticsIgnoringMatcher() *DiacriticsIgnoringMatcher

NewDiacriticsIgnoringMatcher creates a new diacritics ignoring matcher

func (*DiacriticsIgnoringMatcher) Matches ¶

func (dim *DiacriticsIgnoringMatcher) Matches(c rune) []rune

Matches returns possible character matches for given character

type Hypothesis ¶

type Hypothesis struct {
	History  []*Candidate
	Current  *Candidate
	Previous *Hypothesis
	Score    float32
}

Hypothesis represents a normalization hypothesis in beam search

func GetBestHypothesis ¶

func GetBestHypothesis(hypotheses []*Hypothesis) *Hypothesis

GetBestHypothesis returns the best hypothesis from the list

func NewHypothesis ¶

func NewHypothesis() *Hypothesis

NewHypothesis creates a new hypothesis

func (*Hypothesis) Equals ¶

func (h *Hypothesis) Equals(other *Hypothesis) bool

Equals checks if two hypotheses are equal

func (*Hypothesis) Hash ¶

func (h *Hypothesis) Hash() int

Hash returns hash for hypothesis

type Node ¶

type Node struct {
	Index        int
	Char         rune
	Type         NodeType
	Word         string
	EpsilonNodes []*Node
	Nodes        map[rune]*Node
}

Node represents a node in the character graph

func NewNode ¶

func NewNode(index int, char rune, nodeType NodeType, word string) *Node

NewNode creates a new node

func (*Node) AddChild ¶

func (n *Node) AddChild(index int, c rune, nodeType NodeType, word string) *Node

AddChild adds or updates a child node

func (*Node) ConnectEpsilon ¶

func (n *Node) ConnectEpsilon(node *Node) bool

ConnectEpsilon connects this node to another node via epsilon transition

func (*Node) Debug ¶

func (n *Node) Debug() string

Debug returns detailed debug string

func (*Node) Equals ¶

func (n *Node) Equals(other *Node) bool

Equals checks equality based on index

func (*Node) GetAllChildNodes ¶

func (n *Node) GetAllChildNodes() []*Node

GetAllChildNodes returns all child nodes including epsilon-connected nodes

func (*Node) GetChildList ¶

func (n *Node) GetChildList(c rune) []*Node

GetChildList returns list of children matching given character

func (*Node) GetChildListMulti ¶

func (n *Node) GetChildListMulti(chars []rune) []*Node

GetChildListMulti returns list of children matching any of given characters

func (*Node) GetImmediateChild ¶

func (n *Node) GetImmediateChild(c rune) *Node

GetImmediateChild returns immediate child node for given character

func (*Node) GetImmediateChildNodeIterable ¶

func (n *Node) GetImmediateChildNodeIterable() []*Node

GetImmediateChildNodeIterable returns all immediate child nodes (same as GetImmediateChildNodes)

func (*Node) GetImmediateChildNodes ¶

func (n *Node) GetImmediateChildNodes() []*Node

GetImmediateChildNodes returns all immediate child nodes

func (*Node) HasChild ¶

func (n *Node) HasChild(c rune) bool

HasChild checks if node has a child with given character (including epsilon nodes)

func (*Node) HasEpsilonConnection ¶

func (n *Node) HasEpsilonConnection() bool

HasEpsilonConnection checks if node has epsilon connections

func (*Node) HasImmediateChild ¶

func (n *Node) HasImmediateChild(c rune) bool

HasImmediateChild checks if node has an immediate child with given character

func (*Node) Hash ¶

func (n *Node) Hash() int

Hash returns hash code for the node (based on index)

func (*Node) String ¶

func (n *Node) String() string

String returns string representation of the node

type NodeType ¶

type NodeType int

NodeType represents the type of node in the graph

const (
	TypeEmpty     NodeType = 0
	TypeWord      NodeType = 1
	TypeEnding    NodeType = 2
	TypeGraphRoot NodeType = 3
)

type Operation ¶

type Operation int

Operation represents the type of edit operation

const (
	NoError Operation = iota
	Insertion
	Deletion
	Substitution
	Transposition
	NA
)

type StemEndingGraph ¶

type StemEndingGraph struct {
	EndingGraph *CharacterGraph
	StemGraph   *CharacterGraph
	// contains filtered or unexported fields
}

StemEndingGraph creates a character graph from stems and endings

func NewStemEndingGraph ¶

func NewStemEndingGraph(stemWords []string, endingsPath string) (*StemEndingGraph, error)

NewStemEndingGraph creates a new stem-ending graph

func NewStemEndingGraphFromMorphology ¶

func NewStemEndingGraphFromMorphology(morph *morphology.TurkishMorphology, endingsPath string) (*StemEndingGraph, error)

NewStemEndingGraphFromMorphology creates a stem-ending graph from TurkishMorphology. This matches Java's implementation: extracting stems from morphology.

func (*StemEndingGraph) GetEndingGraph ¶

func (seg *StemEndingGraph) GetEndingGraph() *CharacterGraph

GetEndingGraph returns the ending graph

func (*StemEndingGraph) GetStemGraph ¶

func (seg *StemEndingGraph) GetStemGraph() *CharacterGraph

GetStemGraph returns the stem graph

type TurkishSentenceNormalizer ¶

type TurkishSentenceNormalizer struct {
	SpellChecker            *TurkishSpellChecker
	Replacements            map[string]string
	NoSplitWords            map[string]bool
	CommonSplits            map[string]string
	CommonConnectedSuffixes map[string]bool
	LookupManual            map[string][]string
	LookupFromGraph         map[string][]string
	LookupFromASCII         map[string][]string
	AlwaysApplyDeasciifier  bool
	// contains filtered or unexported fields
}

TurkishSentenceNormalizer normalizes informal Turkish sentences

func NewTurkishSentenceNormalizer ¶

func NewTurkishSentenceNormalizer(stemWords []string, resourcesPath string) (*TurkishSentenceNormalizer, error)

NewTurkishSentenceNormalizer creates a new sentence normalizer

func (*TurkishSentenceNormalizer) Normalize ¶

func (tsn *TurkishSentenceNormalizer) Normalize(sentence string) string

Normalize normalizes a sentence

func (*TurkishSentenceNormalizer) NormalizeWithBeamSearch ¶

func (tsn *TurkishSentenceNormalizer) NormalizeWithBeamSearch(sentence string) string

NormalizeWithBeamSearch normalizes a sentence using beam search (simplified, without a language model)

type TurkishSentenceNormalizerAdvanced ¶

type TurkishSentenceNormalizerAdvanced struct {
	SpellChecker            *TurkishSpellChecker
	Morphology              *morphology.TurkishMorphology
	InformalMorphology      *morphology.TurkishMorphology
	AnalysisConverter       *analysis.InformalAnalysisConverter
	LanguageModel           lm.LanguageModel
	Replacements            map[string]string
	NoSplitWords            map[string]bool
	CommonSplits            map[string]string
	CommonConnectedSuffixes map[string]bool
	LookupManual            map[string][]string
	LookupFromGraph         map[string][]string
	LookupFromASCII         map[string][]string
	AlwaysApplyDeasciifier  bool
}

TurkishSentenceNormalizerAdvanced normalizes informal Turkish sentences with full morphology support

func NewTurkishSentenceNormalizerAdvanced ¶

func NewTurkishSentenceNormalizerAdvanced(morph *morphology.TurkishMorphology, dataRoot string) (*TurkishSentenceNormalizerAdvanced, error)

NewTurkishSentenceNormalizerAdvanced creates a new advanced sentence normalizer with morphology

func (*TurkishSentenceNormalizerAdvanced) Normalize ¶

func (tsn *TurkishSentenceNormalizerAdvanced) Normalize(sentence string) string

Normalize normalizes a sentence using full morphological analysis and beam search

type TurkishSentenceNormalizerEnhanced ¶

type TurkishSentenceNormalizerEnhanced struct {
	LookupManual   map[string][]string
	WordDictionary map[string]bool
	SpellChecker   *CharacterGraphDecoder
	Graph          *CharacterGraph
}

TurkishSentenceNormalizerEnhanced is an improved normalizer with lookup tables and dictionary support

func NewTurkishSentenceNormalizerEnhanced ¶

func NewTurkishSentenceNormalizerEnhanced() (*TurkishSentenceNormalizerEnhanced, error)

NewTurkishSentenceNormalizerEnhanced creates a new enhanced normalizer

func (*TurkishSentenceNormalizerEnhanced) Normalize ¶

func (tsne *TurkishSentenceNormalizerEnhanced) Normalize(sentence string) string

Normalize normalizes a Turkish sentence

type TurkishSentenceNormalizerWithLexicon ¶

type TurkishSentenceNormalizerWithLexicon struct {
	LookupManual   map[string][]string
	Lexicon        *lexicon.RootLexicon
	WordDictionary map[string]bool
	SpellChecker   *CharacterGraphDecoder
	Graph          *CharacterGraph
}

TurkishSentenceNormalizerWithLexicon uses full lexicon for normalization

func NewTurkishSentenceNormalizerWithLexicon ¶

func NewTurkishSentenceNormalizerWithLexicon() (*TurkishSentenceNormalizerWithLexicon, error)

NewTurkishSentenceNormalizerWithLexicon creates normalizer with full lexicon

func (*TurkishSentenceNormalizerWithLexicon) Normalize ¶

func (tsnl *TurkishSentenceNormalizerWithLexicon) Normalize(sentence string) string

Normalize normalizes a Turkish sentence

type TurkishSpellChecker ¶

type TurkishSpellChecker struct {
	Decoder     *CharacterGraphDecoder
	CharMatcher CharMatcher
	Morphology  interface{} // *morphology.TurkishMorphology

	LanguageModel lm.LanguageModel
	// contains filtered or unexported fields
}

TurkishSpellChecker provides spell checking and suggestion functionality

func NewTurkishSpellChecker ¶

func NewTurkishSpellChecker(stemWords []string, endingsPath string, matcher CharMatcher) (*TurkishSpellChecker, error)

NewTurkishSpellChecker creates a new spell checker

func (*TurkishSpellChecker) Check ¶

func (tsc *TurkishSpellChecker) Check(word string) bool

Check checks if word is spelled correctly

func (*TurkishSpellChecker) RankByFrequency ¶

func (tsc *TurkishSpellChecker) RankByFrequency(suggestions []string, frequencies map[string]int) []string

RankByFrequency ranks suggestions by frequency (requires frequency map)

func (*TurkishSpellChecker) SuggestForWord ¶

func (tsc *TurkishSpellChecker) SuggestForWord(word string) []string

SuggestForWord returns suggestions for a misspelled word

func (*TurkishSpellChecker) SuggestForWordForNormalization ¶

func (tsc *TurkishSpellChecker) SuggestForWordForNormalization(word string, leftContext string, rightContext string) []string

SuggestForWordForNormalization returns suggestions for normalization (alias)

func (*TurkishSpellChecker) SuggestForWordWithContext ¶

func (tsc *TurkishSpellChecker) SuggestForWordWithContext(word string, previous string, next string) []string

SuggestForWordWithContext returns suggestions with context awareness

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
deasciifier

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL