CitationGraphs

package module

v0.0.0-...-cc6e43b Latest Latest Go to latest Published: Apr 1, 2021 License: MIT Imports: 20 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/wujunfeng1/CitationGraphs

Links

Open Source Insights

README ¶

CitationGraphs.go

A go package for citation graphs.

Documentation ¶

Overview ¶

CitationGraphs.go project CitationGraphs.go

CitationGraphs.go document

Index ¶

func LoadMemberships(fileName string) map[int64][]float64
func SaveCitationGraph(path string, prefix string, citationGraph *CitationGraph)
func SaveMemberships(memberships map[int64][]float64, fileName string)
func TidyTitle(title string) string
type CitationGraph
- func LoadCitationGraph(path string, prefix string) *CitationGraph
- func (g *CitationGraph) ClusterByLDA(numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterLabelsByGSDMM(numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterLabelsByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
- func (g *CitationGraph) ClusterTitlesByGSDMM(simType, numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterTitlesByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
- func (g *CitationGraph) CompareByAMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByARI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByCPM(gamma float64, communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByModularity(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByNMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByRI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) ComputeEMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) ComputeEntropies(communities []map[uint]bool, memberships map[int64][]float64) (float64, float64, float64)
- func (g *CitationGraph) CreateCorpus(corpusType int) *Corpus
- func (g *CitationGraph) CreateCorpusSeq(corpusType int) (corpus *CorpusSeq, years []int, nodeIDs []int64, isEnglish []bool)
- func (g *CitationGraph) CreateCorpusX(corpusType int) *CorpusX
- func (g *CitationGraph) GetCommunitiesFromMemberships(memberships map[int64][]float64) []map[uint]bool
- func (g *CitationGraph) GetEmergingTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
- func (g *CitationGraph) GetEmergingTrends(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[string][]int
- func (g *CitationGraph) GetHotTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
- func (g *CitationGraph) GetIdxMainNode(nodeID int64) uint
- func (g *CitationGraph) GetPhraseSimilarity(simType int) map[string]map[string]float64
- func (g *CitationGraph) GetPhraseSimilarityX(simType int) map[string]map[string]float64
- func (g *CitationGraph) GetStronglyConnectedPhrases(thresFreq, thresRatio float64) map[PhrasePair]PairFreq
- func (g *CitationGraph) Leap2Trend(fileNamePrefix string, yearStartFrom, yearEndWith, minFreq, minJump int)
- func (g *CitationGraph) SaveWord2VecTrainingData(fileNamePrefix string, numIters, minFreq int, minScore float64, useW2PEx bool, ...)
- func (g *CitationGraph) SimTFIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
- func (g *CitationGraph) SimTFSimIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
- func (g *CitationGraph) SortByYear() map[int][]string
- func (g *CitationGraph) TFIDF() []map[string]float64
- func (g *CitationGraph) Word2Vec(fileNamePrefix string, numIters, minFreq int, minScore float64, useW2PEx bool, ...)
type CitationNode
type Corpus
- func NewCorpus() *Corpus
- func (this *Corpus) AddDoc(words []string)
- func (this *Corpus) GetConcurrences() map[uint]map[uint]float64
type CorpusSeq
- func NewCorpusSeq() *CorpusSeq
- func (this *CorpusSeq) AddDoc(words []string)
- func (this *CorpusSeq) GetConcurrences() map[uint]map[uint]float64
- func (this *CorpusSeq) Word2Phrase(numIters, minFreq int, minScore float64) *CorpusSeq
- func (this *CorpusSeq) Word2PhraseEx(numIters, minFreq int, minScore float64) *CorpusSeq
type CorpusX
- func NewCorpusX() *CorpusX
- func (this *CorpusX) AddDoc(words [][]string)
- func (this *CorpusX) GetConcurrences() map[uint]map[uint]float64
- func (this *CorpusX) GetDocConcurrences() map[uint]map[uint]float64
- func (this *CorpusX) GetExclusions() map[uint]map[uint]bool
type DocWord
type GSDMM
- func NewGSDMM(numTopics int, alpha float64, beta float64, data *Corpus) *GSDMM
- func (this *GSDMM) ComputeEntropy() float64
- func (this *GSDMM) ComputeRelativeEntropy() float64
- func (this *GSDMM) Infer(wordCounts map[int]int) []float64
- func (this *GSDMM) Init()
- func (this *GSDMM) ResampleTopics(numIters int)
- func (this *GSDMM) Train(numIters int)
type LDA
- func NewLDA(numTopics int, alpha float64, beta float64, data *Corpus) *LDA
- func (this *LDA) ComputeEntropy() float64
- func (this *LDA) ComputeRelativeEntropy() float64
- func (this *LDA) Infer(wordCounts map[int]int) []float64
- func (this *LDA) Init()
- func (this *LDA) ResampleTopics(numIters int)
- func (this *LDA) Train(numIters int)
type PairFreq
type PhrasePair
type RankJump
type TopicModel
type WPDM

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func LoadMemberships ¶

func LoadMemberships(fileName string) map[int64][]float64

================================================================================================= func LoadMemberships

func SaveCitationGraph ¶

func SaveCitationGraph(path string, prefix string, citationGraph *CitationGraph)

=========================================================================================== function SaveCitationGraph brief description: Save data to three files (nodes, edges, labels) of a citation graph. input:

path: The name of the path where the three files are stored.
prefix: The prefix of the names of the three files. For example, the prefix of
        ijcai-citation-graph-nodes.csv is ijcai.
citationGraph: The citation graph represented by the three files.

output:

nothing

func SaveMemberships ¶

func SaveMemberships(memberships map[int64][]float64, fileName string)

================================================================================================= func SaveMemberships

func TidyTitle ¶

func TidyTitle(title string) string

================================================================================================= function TidyTitle brief description: Make a title more tidy before using it. The procedure includes the following

steps:
(1) remove the spaces at the head and the tail of the title,
(2) replace "&lt;" with "<",
(3) replace "&gt;" with ">",
(4) replace "&amp;" with "&",
(5) replace "&quot;" with "\"",
(6) replace "&apos;" with "'",
(7) replace "&//[number];" with the corresponding Unicode character

input:

title: The String (text) of a title.

output:

A string of the tidied-up title.

Types ¶

type CitationGraph ¶

type CitationGraph struct {
	Nodes        map[int64]*CitationNode
	ToBeAnalyzed []int64
	// contains filtered or unexported fields
}

================================================================================================= struct CitationGraph brief description: A data structure of citation graph, the data of which are collected from

the website https://academic.microsoft.com , which is the public service
website of MAG(Microsoft Academic Graph)

fields:

nodes: The nodes (of struct CitationNode) stored in a dictionary with key = id
toBeAnalyzed: The list of nodes to be analyzed. In ijcai_clustering dataset, these nodes
              are those published in IJCAI.

func LoadCitationGraph ¶

func LoadCitationGraph(path string, prefix string) *CitationGraph

================================================================================================= function LoadCitationGraph brief description:

Load data from three files (nodes, edges, labels) of a citation graph.

input:

path: The name of the path where the three files are stored.
prefix: The prefix of the names of the three files. For example, the prefix of
        ijcai-citation-graph-nodes.csv is ijcai.

output:

The citation graph represented by the three files.

func (*CitationGraph) ClusterByLDA ¶

func (g *CitationGraph) ClusterByLDA(numTopics int, alpha, beta float64, numIters int,
) map[int64][]float64

================================================================================================= func (g *CitationGraph) ClusterByLDA brief description: cluster the main nodes of g with their titles and their reference titles

using LDA.

input:

numTopics: number of topics
alpha, beta: the parameters of LDA
numIters: number of iterations for LDA

output:

for each main node, gives the likelihoods of the node belonging to each cluster.

func (*CitationGraph) ClusterLabelsByGSDMM ¶

func (g *CitationGraph) ClusterLabelsByGSDMM(numTopics int, alpha, beta float64, numIters int,
) map[int64][]float64

}

func (*CitationGraph) ClusterLabelsByWPDM ¶

func (g *CitationGraph) ClusterLabelsByWPDM(eps float64, minPts uint, simType int,
	workSpaceFileName string) []map[uint]bool

================================================================================================= func (g *CitationGraph) ClusterLabelsByWPDM brief description: cluster the main nodes of g with their labels

using WPDM.

input:

eps: the radius of neighborhood.
minPts: Only if the neighborhood of a point contains at least minPts points
	(the center point of the neighborhood included), the neighborhood is
	called dense. Only dense neighborhoods are connected to communities.
simType: the type of similarity, 0 for simple induced similarity, 1 for normalized
	similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for
	normalized jaccard similarity
workSpaceFileName: a file name for intermediate result

output:

the clusters found, each given as a set of uint keys (presumably the indices of the main nodes in the cluster).

func (*CitationGraph) ClusterTitlesByGSDMM ¶

func (g *CitationGraph) ClusterTitlesByGSDMM(simType, numTopics int, alpha, beta float64, numIters int,
) map[int64][]float64

================================================================================================= func (g *CitationGraph) ClusterTitlesByGSDMM brief description: cluster the main nodes of g with their titles and their reference titles

using GSDMM.

input:

simType: the type of similarity, 0 for simple induced similarity, 1 for normalized
	similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for
	normalized jaccard similarity
numTopics: number of topics
alpha, beta: the parameters of GSDMM
numIters: number of iterations

output:

for each main node, gives the likelihoods of the node belonging to each cluster.

func (*CitationGraph) ClusterTitlesByWPDM ¶

func (g *CitationGraph) ClusterTitlesByWPDM(eps float64, minPts uint, simType int,
	workSpaceFileName string) []map[uint]bool

================================================================================================= func (g *CitationGraph) ClusterTitlesByWPDM brief description: cluster the main nodes of g with their titles and their reference titles

using WPDM.

input:

eps: the radius of neighborhood.
minPts: Only if the neighborhood of a point contains at least minPts points
	(the center point of the neighborhood included), the neighborhood is
	called dense. Only dense neighborhoods are connected to communities.
simType: the type of similarity, 0 for simple induced similarity, 1 for normalized
	similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for
	normalized jaccard similarity
workSpaceFileName: a file name for intermediate result

output:

the clusters found, each given as a set of uint keys (presumably the indices of the main nodes in the cluster).

func (*CitationGraph) CompareByAMI ¶

func (g *CitationGraph) CompareByAMI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByAMI

func (*CitationGraph) CompareByARI ¶

func (g *CitationGraph) CompareByARI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByARI

func (*CitationGraph) CompareByCPM ¶

func (g *CitationGraph) CompareByCPM(gamma float64, communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByCPM

func (*CitationGraph) CompareByMI ¶

func (g *CitationGraph) CompareByMI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByMI

func (*CitationGraph) CompareByModularity ¶

func (g *CitationGraph) CompareByModularity(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByModularity

func (*CitationGraph) CompareByNMI ¶

func (g *CitationGraph) CompareByNMI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByNMI

func (*CitationGraph) CompareByRI ¶

func (g *CitationGraph) CompareByRI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) CompareByRI

func (*CitationGraph) ComputeEMI ¶

func (g *CitationGraph) ComputeEMI(communities []map[uint]bool,
	memberships map[int64][]float64) float64

================================================================================================= func (g *CitationGraph) ComputeEMI

func (*CitationGraph) ComputeEntropies ¶

func (g *CitationGraph) ComputeEntropies(communities []map[uint]bool,
	memberships map[int64][]float64) (float64, float64, float64)

================================================================================================= func (g *CitationGraph) ComputeEntropies

func (*CitationGraph) CreateCorpus ¶

func (g *CitationGraph) CreateCorpus(corpusType int) *Corpus

================================================================================================= func (g *CitationGraph) CreateCorpus brief description: create a corpus from a citation graph input

corpusType:
	0 for title + ref titles per document for main nodes,
	1 for title per document for main nodes,
	2 for title per document for all nodes,
	3 for labels per document for main nodes,

func (*CitationGraph) CreateCorpusSeq ¶

func (g *CitationGraph) CreateCorpusSeq(corpusType int) (corpus *CorpusSeq, years []int,
	nodeIDs []int64, isEnglish []bool)

================================================================================================= func (g *CitationGraph) CreateCorpusSeq brief description: create a corpus from a citation graph input

corpusType:
	0 for title + ref titles per document for main nodes,
	1 for title per document for main nodes,
	2 for title per document for all nodes,
	3 for labels per document for main nodes,

func (*CitationGraph) CreateCorpusX ¶

func (g *CitationGraph) CreateCorpusX(corpusType int) *CorpusX

================================================================================================= func (g *CitationGraph) CreateCorpusX brief description: create a corpusX from a citation graph input

corpusType:
	0 for title + ref titles per document for main nodes,
	1 for title per document for main nodes,
	2 for title per document for all nodes,

func (*CitationGraph) GetCommunitiesFromMemberships ¶

func (g *CitationGraph) GetCommunitiesFromMemberships(memberships map[int64][]float64) []map[uint]bool

================================================================================================= func (g *CitationGraph) GetCommunitiesFromMemberships

func (*CitationGraph) GetEmergingTopicPublications ¶

func (g *CitationGraph) GetEmergingTopicPublications(yearToday, yearRecent, yearFarAway,
	lowThreshold, highThreshold int) map[int64][]int

================================================================================================= func (g *CitationGraph) GetEmergingTopicPublications note: The publications of emerging topic have the following characteristics:

(1) cold start: in the year of publication, it receives citations <= lowThreshold per year
(2) break out: in recent years, it receives avg citations >= highThreshold per year

func (*CitationGraph) GetEmergingTrends ¶

func (g *CitationGraph) GetEmergingTrends(yearToday, yearRecent, yearFarAway, lowThreshold,
	highThreshold int) map[string][]int

================================================================================================= func (g *CitationGraph) GetEmergingTrends

func (*CitationGraph) GetHotTopicPublications ¶

func (g *CitationGraph) GetHotTopicPublications(yearToday, yearRecent, yearFarAway,
	lowThreshold, highThreshold int) map[int64][]int

================================================================================================= func (g *CitationGraph) GetHotTopicPublications note: The publications of hot topic have the following characteristics:

hot start: in the year of publication or next year, it receives citations >= highThreshold per year

func (*CitationGraph) GetIdxMainNode ¶

func (g *CitationGraph) GetIdxMainNode(nodeID int64) uint

================================================================================================= func (g *CitationGraph) GetIdxMainNode

func (*CitationGraph) GetPhraseSimilarity ¶

func (g *CitationGraph) GetPhraseSimilarity(simType int) map[string]map[string]float64

================================================================================================= method GetPhraseSimilarity brief description: compute the similarities between pairs of phrases and gives them in form of a

sparse matrix using Title Link method

input:

simType: the type of similarity

output:

the sparse matrix of phrase similarities

note:

The title link method can be found in:
Bogomolova, A., Ryazanova, M., & Balk, I. (2021). Cluster approach to analysis of publication
titles. In Journal of Physics: Conference Series (Vol. 1727, No. 1, p. 012016). IOP Publishing.

func (*CitationGraph) GetPhraseSimilarityX ¶

func (g *CitationGraph) GetPhraseSimilarityX(simType int) map[string]map[string]float64

================================================================================================= method GetPhraseSimilarityX brief description: compute the similarities between pairs of phrases and gives them in form of a

sparse matrix using Title Link method

input:

simType: the type of similarity

output:

the sparse matrix of phrase similarities

note:

The title link method can be found in:
Bogomolova, A., Ryazanova, M., & Balk, I. (2021). Cluster approach to analysis of publication
titles. In Journal of Physics: Conference Series (Vol. 1727, No. 1, p. 012016). IOP Publishing.

func (*CitationGraph) GetStronglyConnectedPhrases ¶

func (g *CitationGraph) GetStronglyConnectedPhrases(thresFreq, thresRatio float64) map[PhrasePair]PairFreq

================================================================================================= func (g *CitationGraph) GetStronglyConnectedPhrases

func (*CitationGraph) Leap2Trend ¶

func (g *CitationGraph) Leap2Trend(fileNamePrefix string, yearStartFrom, yearEndWith, minFreq, minJump int)

func (*CitationGraph) SaveWord2VecTrainingData ¶

func (g *CitationGraph) SaveWord2VecTrainingData(fileNamePrefix string, numIters, minFreq int,
	minScore float64, useW2PEx bool, yearStartFrom int)

================================================================================================= func (g *CitationGraph) SaveWord2VecTrainingData brief description:

save tokenized and stemmed publication titles to a file for training of word2vec

func (*CitationGraph) SimTFIDF ¶

func (g *CitationGraph) SimTFIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64

================================================================================================= method SimTFIDF brief description: compute the Fuzzy TFIDF of possible key phrases for each main node of a citationGraph input:

phraseSimilarity: similarities of phrases

output:

the result of Fuzzy TFIDF grouped by the main nodes of the CitationGraph

func (*CitationGraph) SimTFSimIDF ¶

func (g *CitationGraph) SimTFSimIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64

================================================================================================= method SimTFSimIDF brief description: compute the Fuzzy TFIDF of possible key phrases for each main node of a citationGraph input:

phraseSimilarity: similarities of phrases

output:

the result of Fuzzy TFIDF grouped by the main nodes of the CitationGraph

func (*CitationGraph) SortByYear ¶

func (g *CitationGraph) SortByYear() map[int][]string

================================================================================================= func (g *CitationGraph) SortByYear brief description: return a sorted list of titles input:

None

output:

A map with year as key and slice of titles as value.

func (*CitationGraph) TFIDF ¶

func (g *CitationGraph) TFIDF() []map[string]float64

================================================================================================= method TFIDF brief description: compute the TFIDF of possible key phrases for each main node of a citationGraph input:

nothing

output:

the result of TFIDF grouped by the main nodes of the CitationGraph

func (*CitationGraph) Word2Vec ¶

func (g *CitationGraph) Word2Vec(fileNamePrefix string, numIters, minFreq int,
	minScore float64, useW2PEx bool, yearStartFrom, yearEndWith int)

type CitationNode ¶

type CitationNode struct {
	ID     int64
	Year   int64
	Title  string
	Labels []string
	Refs   []int64
	Cites  []int64
}

================================================================================================= struct CitationNode brief description: The node structure of a citation graph, the data of which are collected

from crawling the website https://academic.microsoft.com , which is the
public service website of MAG(Microsoft Academic Graph)

fields:

id: The MAG(Microsoft Academic Graph) id of the paper at this node.
    We can access the detail of the paper with the id by navigating the web link:
    https://academic.microsoft.com/paper/$id
year: The year of publication of this paper.
title: The title of the paper.
labels: The labels from MAG(Microsoft Academic Graph).
refs: The references of this paper collected from MAG.
      Please note that this field could be inaccurate: information in MAG is often delayed.
      Many new papers with refs don't have refs listed in MAG.
cites: The citations to this paper from other papers (also collected from MAG).
      Please note that this field is as inaccurate as refs due to the same reason.

type Corpus ¶

type Corpus struct {
	Vocab map[string]int // the vocabulary
	Docs  []map[int]int  // keys: docID, wordID, value: word count
}

================================================================================================= struct Corpus brief description: the corpus data structure

func NewCorpus ¶

func NewCorpus() *Corpus

================================================================================================= func NewCorpus brief description: create an empty corpus

func (*Corpus) AddDoc ¶

func (this *Corpus) AddDoc(words []string)

================================================================================================= func (this *Corpus) AddDoc brief description: add one document to corpus with specified docId and word count list, if the

specified docId already exists in corpus, the old doc will be overwritten

func (*Corpus) GetConcurrences ¶

func (this *Corpus) GetConcurrences() map[uint]map[uint]float64

================================================================================================= func (this *Corpus) GetConcurrences brief description: get concurrences from corpus

type CorpusSeq ¶

type CorpusSeq struct {
	Vocab map[string]int // the vocabulary
	Docs  [][]int        // keys: docID, wordIdx, value: word id
}

================================================================================================= struct CorpusSeq brief description: the extended corpus data structure

func NewCorpusSeq ¶

func NewCorpusSeq() *CorpusSeq

================================================================================================= func NewCorpusSeq brief description: create an empty corpus

func (*CorpusSeq) AddDoc ¶

func (this *CorpusSeq) AddDoc(words []string)

================================================================================================= func (this *CorpusSeq) AddDoc brief description: add one document to corpus with specified docId and word count list, if the

specified docId already exists in corpus, the old doc will be overwritten

func (*CorpusSeq) GetConcurrences ¶

func (this *CorpusSeq) GetConcurrences() map[uint]map[uint]float64

================================================================================================= func (this *CorpusSeq) GetConcurrences brief description: get concurrences from corpus

func (*CorpusSeq) Word2Phrase ¶

func (this *CorpusSeq) Word2Phrase(numIters, minFreq int, minScore float64) *CorpusSeq

================================================================================================= func (this *CorpusSeq) Word2Phrase

func (*CorpusSeq) Word2PhraseEx ¶

func (this *CorpusSeq) Word2PhraseEx(numIters, minFreq int, minScore float64) *CorpusSeq

================================================================================================= func (this *CorpusSeq) Word2PhraseEx

type CorpusX ¶

type CorpusX struct {
	Vocab map[string]int  // the vocabulary
	Docs  [][]map[int]int // keys: docID, wordID, value: word count
}

================================================================================================= struct CorpusX brief description: the extended corpus data structure

func NewCorpusX ¶

func NewCorpusX() *CorpusX

================================================================================================= func NewCorpusX brief description: create an empty corpus

func (*CorpusX) AddDoc ¶

func (this *CorpusX) AddDoc(words [][]string)

================================================================================================= func (this *CorpusX) AddDoc brief description: add one document to corpus with specified docId and word count list, if the

specified docId already exists in corpus, the old doc will be overwritten

func (*CorpusX) GetConcurrences ¶

func (this *CorpusX) GetConcurrences() map[uint]map[uint]float64

================================================================================================= func (this *CorpusX) GetConcurrences brief description: get concurrences from corpus

func (*CorpusX) GetDocConcurrences ¶

func (this *CorpusX) GetDocConcurrences() map[uint]map[uint]float64

================================================================================================= func (this *CorpusX) GetDocConcurrences brief description: get document concurrences from corpus

func (*CorpusX) GetExclusions ¶

func (this *CorpusX) GetExclusions() map[uint]map[uint]bool

================================================================================================= func (this *CorpusX) GetExclusions brief description: get exclusions from corpus

type DocWord ¶

type DocWord struct {
	DocId  int
	WordId int
}

================================================================================================= struct DocWord

type GSDMM ¶

type GSDMM struct {
	Alpha     float64 // document topic mixture hyperparameter
	Beta      float64 // topic word mixture hyperparameter
	NumTopics int     // number of topics

	Data          *Corpus // the input corpus
	NumWordsInDoc []int   // the number of words in documents

	DocTopic          []int   // doc-topic count table
	TopicWordCount    [][]int // word-topic count table
	TopicWordCountSum []int   // word-topic-sum count table
	TopicDocCount     []int   // topic-doc-sum count table
}

================================================================================================= struct GSDMM brief description: the data structure of GSDMM model

func NewGSDMM ¶

func NewGSDMM(numTopics int, alpha float64, beta float64, data *Corpus) *GSDMM

================================================================================================= func NewGSDMM brief description: create a GSDMM instance with collapsed gibbs sampler

func (*GSDMM) ComputeEntropy ¶

func (this *GSDMM) ComputeEntropy() float64

================================================================================================= func (this *GSDMM) ComputeEntropy brief description: compute entropy

func (*GSDMM) ComputeRelativeEntropy ¶

func (this *GSDMM) ComputeRelativeEntropy() float64

================================================================================================= func (this *GSDMM) ComputeRelativeEntropy brief description: compute relative entropy

func (*GSDMM) Infer ¶

func (this *GSDMM) Infer(wordCounts map[int]int) []float64

================================================================================================= func (this *GSDMM) Infer brief description: infer topics on new documents

func (*GSDMM) Init ¶

func (this *GSDMM) Init()

================================================================================================= func (this *GSDMM) Init

func (*GSDMM) ResampleTopics ¶

func (this *GSDMM) ResampleTopics(numIters int)

================================================================================================= func (this *GSDMM) ResampleTopics

func (*GSDMM) Train ¶

func (this *GSDMM) Train(numIters int)

================================================================================================= func (this *GSDMM) Train brief description: train model

type LDA ¶

type LDA struct {
	Alpha     float64 // document topic mixture hyperparameter
	Beta      float64 // topic word mixture hyperparameter
	NumTopics int     // number of topics

	Data *Corpus // the input corpus

	WordTopicCount [][]int           // word-topic count table
	DocTopicCount  [][]int           // doc-topic count table
	TopicCountSum  []int             // word-topic-sum count table
	DocWordToTopic map[DocWord][]int // doc-word-topic count table
}

================================================================================================= struct LDA brief description: the data structure of LDA model with Collapsed Gibbs Sampler note:

The fast collapsed gibbs sampler algorithm can be found in reference:
Porteous, I., Newman, D., Ihler, A., Asuncion, A., Smyth, P., & Welling, M. (2008, August). Fast
collapsed gibbs sampling for latent dirichlet allocation. In Proceedings of the 14th ACM SIGKDD
international conference on Knowledge discovery and data mining (pp. 569-577).

func NewLDA ¶

func NewLDA(numTopics int, alpha float64, beta float64, data *Corpus) *LDA

================================================================================================= func NewLDA brief description: create an LDA instance with collapsed gibbs sampler

func (*LDA) ComputeEntropy ¶

func (this *LDA) ComputeEntropy() float64

================================================================================================= func (this *LDA) ComputeEntropy brief description: compute entropy

func (*LDA) ComputeRelativeEntropy ¶

func (this *LDA) ComputeRelativeEntropy() float64

================================================================================================= func (this *LDA) ComputeRelativeEntropy brief description: compute relative entropy

func (*LDA) Infer ¶

func (this *LDA) Infer(wordCounts map[int]int) []float64

================================================================================================= func (this *LDA) Infer brief description: infer topics on new documents

func (*LDA) Init ¶

func (this *LDA) Init()

================================================================================================= func (this *LDA) Init

func (*LDA) ResampleTopics ¶

func (this *LDA) ResampleTopics(numIters int)

================================================================================================= func (this *LDA) ResampleTopics

func (*LDA) Train ¶

func (this *LDA) Train(numIters int)

================================================================================================= func (this *LDA) Train brief description: train model

type PairFreq ¶

type PairFreq struct {
	Actual   float64
	Expected float64
}

type PhrasePair ¶

type PhrasePair struct {
	Phrase1, Phrase2 string
}

================================================================================================= struct PhrasePair

type RankJump ¶

type RankJump struct {
	// contains filtered or unexported fields
}

type TopicModel ¶

type TopicModel interface {
	// train model for iter iteration
	Train(numIters int)
	// do inference for new doc with its wordCounts
	Infer(doc map[int]int) []float64
	// compute entropy
	ComputeEntropy() float64
	// compute relative entropy
	ComputeRelativeEntropy() float64
}

================================================================================================= interface TopicModel brief description: the common interface of topic models

type WPDM ¶

type WPDM struct {
	Data      *Corpus // the input corpus
	DocTopic  []int   // doc-topic table
	NumTopics int     // number of topics
	// contains filtered or unexported fields
}

================================================================================================= struct WPDM brief description: the data structure of WPDM model

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL