analysis

package

v0.0.80 Latest Latest Go to latest Published: Nov 19, 2023 License: Apache-2.0 Imports: 22 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/alexamies/cnreader

Links

Open Source Insights

Documentation ¶

Overview ¶

Package for vocabulary analysis of a monolingual Chinese text corpus

This includes: reading the corpus documents from disk; tokenization of the corpus into multi-character arrays; computation of term and bigram frequencies; compilation of an index for later full text search; and computation of term occurrence and usage in the corpus.

Index ¶

func GetDocFrequencies(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, ...) (*index.DocumentFrequency, error)
func Subtract(headwords, subtract []dicttypes.Word) []dicttypes.Word
func WriteCorpus(collections []corpus.CollectionEntry, outputConfig generator.HTMLOutPutConfig, ...) (*index.IndexState, error)
func WriteCorpusAll(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, ...) (*index.IndexState, error)
func WriteCorpusCol(collectionFile string, libLoader library.LibraryLoader, ...) error
func WriteHwFiles(dep HWFileDependencies) error
func WriteLibraryFile(lib library.Library, corpora []library.CorpusData, outputFile string, ...)
type CollectionAResults
- func NewCollectionAResults() CollectionAResults
- func ParseText(text string, colTitle string, document *corpus.CorpusEntry, ...) (list.List, *CollectionAResults)
type DictEntry
type Glossary
- func MakeGlossary(domain string, headwords []dicttypes.Word) Glossary
type HWFileDependencies
type HeadwordWriter
type VocabAnalysis
- func GetWordFrequencies(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, ...) (*VocabAnalysis, error)

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func GetDocFrequencies ¶

func GetDocFrequencies(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*index.DocumentFrequency, error)

GetDocFrequencies computes word document frequencies for the corpus.

func Subtract ¶

func Subtract(headwords, subtract []dicttypes.Word) []dicttypes.Word

Subtract the items in the second list from the first

func WriteCorpus ¶

func WriteCorpus(collections []corpus.CollectionEntry,
	outputConfig generator.HTMLOutPutConfig,
	libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer,
	indexConfig index.IndexConfig, dict *dictionary.Dictionary,
	c config.AppConfig, corpusConfig corpus.CorpusConfig, bibClient bibnotes.BibNotesClient) (*index.IndexState, error)

WriteCorpus writes all the collections in the given corpus. collections: the set of collections to write to HTML. baseDir: the base directory to use to write the files (note: baseDir is not a parameter in the signature shown for this function).

func WriteCorpusAll ¶

func WriteCorpusAll(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig,
	indexConfig index.IndexConfig, dict *dictionary.Dictionary,
	c config.AppConfig, bibClient bibnotes.BibNotesClient) (*index.IndexState, error)

WriteCorpusAll writes all the collections in the default corpus (collections.csv file).

func WriteCorpusCol ¶

func WriteCorpusCol(collectionFile string, libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig,
	corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary,
	c config.AppConfig, bibClient bibnotes.BibNotesClient) error

WriteCorpusCol writes a corpus document collection to HTML, including all the entries contained in the collection. collectionFile: the name of the collection file.

func WriteHwFiles ¶

func WriteHwFiles(dep HWFileDependencies) error

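WriteHwFiles writes dictionary headword entries. The dependencies are passed bundled in HWFileDependencies; the fragment that follows appears to be a leftover of a signature that took them as separate parameters: func WriteHwFiles(loader library.LibraryLoader,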

dictTokenizer tokenizer.Tokenizer,
outputConfig generator.HTMLOutPutConfig,
indexState index.IndexState,
wdict map[string]dicttypes.Word,
vocabAnalysis VocabAnalysis,
hww HeadwordWriter) error {

func WriteLibraryFile ¶

func WriteLibraryFile(lib library.Library, corpora []library.CorpusData,
	outputFile string, outputConfig generator.HTMLOutPutConfig)

WriteLibraryFile writes an HTML file describing the corpora in the library.

This is used both for the public site and for the translation portal (which requires login).

Types ¶

type CollectionAResults ¶

type CollectionAResults struct {
	Vocab             map[string]int
	Bigrams           map[string]int
	Usage             map[string]string
	BigramFrequencies ngram.BigramFreqMap
	Collocations      ngram.CollocationMap
	WC, CCount        int
	UnknownChars      map[string]int
	WFDocMap          index.TermFreqDocMap
	BigramDocMap      index.TermFreqDocMap
	DocFreq           index.DocumentFrequency
	BigramDF          index.DocumentFrequency
	DocLengthArray    []index.DocLength
}

A struct to hold the analysis results for the collection

func NewCollectionAResults ¶

func NewCollectionAResults() CollectionAResults

Constructor for empty CollectionAResults

func ParseText ¶

func ParseText(text string, colTitle string, document *corpus.CorpusEntry, dictTokenizer tokenizer.Tokenizer, corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary) (list.List, *CollectionAResults)

ParseText tokenizes a Chinese text corpus document into terms. Parameters:

text: the string to parse
colTitle: Optional parameter used for tracing collocation usage
document: Optional parameter used for tracing collocation usage

Returns:

tokens: the tokens for the parsed text
results: vocabulary analysis results

func (*CollectionAResults) AddResults ¶

func (results *CollectionAResults) AddResults(more *CollectionAResults)

Add more results to this set of results

func (*CollectionAResults) GetHeadwords ¶

func (results *CollectionAResults) GetHeadwords(wdict map[string]*dicttypes.Word) []dicttypes.Word

Returns the subset of words that are lexical (content) words

func (*CollectionAResults) GetLexicalWordFreq ¶

func (results *CollectionAResults) GetLexicalWordFreq(sortedWords []index.SortedWordItem,
	wdict map[string]*dicttypes.Word) []wFResult

Returns the subset of words that are lexical (content) words

func (*CollectionAResults) GetWordFreq ¶

func (results *CollectionAResults) GetWordFreq(sortedWords []index.SortedWordItem,
	wdict map[string]*dicttypes.Word) []wFResult

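Returns word frequency results for the given sorted words. (The original description, "the subset of words that are lexical (content) words", is identical to the one on GetLexicalWordFreq and appears to have been copied from it; confirm against the implementation.)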

type DictEntry ¶

type DictEntry struct {
	Title            string
	Headword         dicttypes.Word
	RelevantDocs     []index.RetrievalResult
	ContainsByDomain []dicttypes.Word
	Contains         []dicttypes.Word
	Collocations     []ngram.BigramFreq
	UsageArr         []wordUsage
	DateUpdated      string
}

DictEntry holds content used for writing a dictionary entry to HTML

type Glossary ¶

type Glossary struct {
	Domain string
	Words  dicttypes.Words
}

Glossary holds the content for a glossary: a domain label and the words belonging to it.

func MakeGlossary ¶

func MakeGlossary(domain string, headwords []dicttypes.Word) Glossary

Makes a glossary by filtering by the domain label and sorting by Chinese pinyin.

type HWFileDependencies ¶ added in v0.0.41

type HWFileDependencies struct {
	Loader         library.LibraryLoader
	DictTokenizer  tokenizer.Tokenizer
	OutputConfig   generator.HTMLOutPutConfig
	IndexState     index.IndexState
	Dict           *dictionary.Dictionary
	VocabAnalysis  VocabAnalysis
	Hww            HeadwordWriter
	BibNotesClient bibnotes.BibNotesClient
}

type HeadwordWriter ¶ added in v0.0.31

type HeadwordWriter interface {
	NewWriter(hwId int) io.Writer
	CloseWriter(hwId int)
}

HeadwordWriter manages files for writing headwords to HTML.

type VocabAnalysis ¶

type VocabAnalysis struct {
	UsageMap     map[string]*[]wordUsage
	WFTotal      map[*index.CorpusWord]index.CorpusWordFreq
	WCTotal      map[string]int
	Collocations ngram.CollocationMap
}

VocabAnalysis bundles up vocabulary analysis

func GetWordFrequencies ¶

func GetWordFrequencies(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*VocabAnalysis, error)

GetWordFrequencies computes word frequencies, collocations, and usage for the corpus.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL