Documentation ¶
Overview ¶
CitationGraphs.go project CitationGraphs.go
CitationGraphs.go document
Index ¶
- func LoadMemberships(fileName string) map[int64][]float64
- func SaveCitationGraph(path string, prefix string, citationGraph *CitationGraph)
- func SaveMemberships(memberships map[int64][]float64, fileName string)
- func TidyTitle(title string) string
- type CitationGraph
- func (g *CitationGraph) ClusterByLDA(numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterLabelsByGSDMM(numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterLabelsByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
- func (g *CitationGraph) ClusterTitlesByGSDMM(simType, numTopics int, alpha, beta float64, numIters int) map[int64][]float64
- func (g *CitationGraph) ClusterTitlesByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
- func (g *CitationGraph) CompareByAMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByARI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByCPM(gamma float64, communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByModularity(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByNMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) CompareByRI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) ComputeEMI(communities []map[uint]bool, memberships map[int64][]float64) float64
- func (g *CitationGraph) ComputeEntropies(communities []map[uint]bool, memberships map[int64][]float64) (float64, float64, float64)
- func (g *CitationGraph) CreateCorpus(corpusType int) *Corpus
- func (g *CitationGraph) CreateCorpusSeq(corpusType int) (corpus *CorpusSeq, years []int, nodeIDs []int64, isEnglish []bool)
- func (g *CitationGraph) CreateCorpusX(corpusType int) *CorpusX
- func (g *CitationGraph) GetCommunitiesFromMemberships(memberships map[int64][]float64) []map[uint]bool
- func (g *CitationGraph) GetEmergingTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
- func (g *CitationGraph) GetEmergingTrends(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[string][]int
- func (g *CitationGraph) GetHotTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
- func (g *CitationGraph) GetIdxMainNode(nodeID int64) uint
- func (g *CitationGraph) GetPhraseSimilarity(simType int) map[string]map[string]float64
- func (g *CitationGraph) GetPhraseSimilarityX(simType int) map[string]map[string]float64
- func (g *CitationGraph) GetStronglyConnectedPhrases(thresFreq, thresRatio float64) map[PhrasePair]PairFreq
- func (g *CitationGraph) Leap2Trend(fileNamePrefix string, yearStartFrom, yearEndWith, minFreq, minJump int)
- func (g *CitationGraph) SaveWord2VecTrainingData(fileNamePrefix string, numIters, minFreq int, minScore float64, useW2PEx bool, ...)
- func (g *CitationGraph) SimTFIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
- func (g *CitationGraph) SimTFSimIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
- func (g *CitationGraph) SortByYear() map[int][]string
- func (g *CitationGraph) TFIDF() []map[string]float64
- func (g *CitationGraph) Word2Vec(fileNamePrefix string, numIters, minFreq int, minScore float64, useW2PEx bool, ...)
- type CitationNode
- type Corpus
- type CorpusSeq
- type CorpusX
- type DocWord
- type GSDMM
- type LDA
- type PairFreq
- type PhrasePair
- type RankJump
- type TopicModel
- type WPDM
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func LoadMemberships ¶
================================================================================================= func LoadMemberships
func SaveCitationGraph ¶
func SaveCitationGraph(path string, prefix string, citationGraph *CitationGraph)
=========================================================================================== function SaveCitationGraph brief description: Save data to three files (nodes, edges, labels) of a citation graph. input:
path: The name of the path where the three files are stored. prefix: The prefix of the names of the three files. For example, the prefix of ijcai-citation-graph-nodes.csv is ijcai. citationGraph: The citation graph represented by the three files.
output:
nothing
func SaveMemberships ¶
================================================================================================= func SaveMemberships
func TidyTitle ¶
================================================================================================= function TidyTitle brief description: Make a title tidier before using it. The procedure includes the following
steps: (1) remove the spaces at the head and the tail of the title, (2) replace "&lt;" with "<", (3) replace "&gt;" with ">", (4) replace "&amp;" with "&", (5) replace "&quot;" with "\"", (6) replace "&apos;" with "'", (7) replace "&#[number];" with the corresponding Unicode character
input:
title: The String (text) of a title.
output:
A string of the tidied-up title.
Types ¶
type CitationGraph ¶
type CitationGraph struct { Nodes map[int64]*CitationNode ToBeAnalyzed []int64 // contains filtered or unexported fields }
================================================================================================= struct CitationGraph brief description: A data structure of citation graph, the data of which are collected from
the website https://academic.microsoft.com , which is the public service website of MAG(Microsoft Academic Graph)
fields:
Nodes: The nodes of struct CitationNode, stored in a dictionary with key = ID. ToBeAnalyzed: The list of nodes to be analyzed. In the ijcai_clustering dataset, these nodes are those published in IJCAI.
func LoadCitationGraph ¶
func LoadCitationGraph(path string, prefix string) *CitationGraph
================================================================================================= function LoadCitationGraph brief description:
Load data from three files (nodes, edges, labels) of a citation graph.
input:
path: The name of the path where the three files are stored. prefix: The prefix of the names of the three files. For example, the prefix of ijcai-citation-graph-nodes.csv is ijcai.
output:
The citation graph represented by the three files.
func (*CitationGraph) ClusterByLDA ¶
func (g *CitationGraph) ClusterByLDA(numTopics int, alpha, beta float64, numIters int, ) map[int64][]float64
================================================================================================= func (g *CitationGraph) ClusterByLDA brief description: cluster the main nodes of g with their titles and their reference titles
using LDA.
input:
numTopics: number of topics alpha, beta: the parameters of LDA numIters: number of iterations for LDA
output:
for each main node, gives the likelihoods of the node belonging to each cluster.
func (*CitationGraph) ClusterLabelsByGSDMM ¶
func (g *CitationGraph) ClusterLabelsByGSDMM(numTopics int, alpha, beta float64, numIters int, ) map[int64][]float64
}
func (*CitationGraph) ClusterLabelsByWPDM ¶
func (g *CitationGraph) ClusterLabelsByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
================================================================================================= func (g *CitationGraph) ClusterLabelsByWPDM brief description: cluster the main nodes of g with their titles and their reference titles
using WPDM.
input:
eps: the radius of neighborhood. minPts: Only if the neighborhood of a point contains at least minPts points (the center point of the neighborhood included), the neighborhood is called dense. Only dense neighborhoods are connected to communities. simType: the type of similarity, 0 for simple induced similarity, 1 for normalized similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for normalized jaccard similarity workSpaceFileName: a file name for intermediate result
output:
the communities found, each given as a set of members (presumably the indices of main nodes).
func (*CitationGraph) ClusterTitlesByGSDMM ¶
func (g *CitationGraph) ClusterTitlesByGSDMM(simType, numTopics int, alpha, beta float64, numIters int, ) map[int64][]float64
================================================================================================= func (g *CitationGraph) ClusterTitlesByGSDMM brief description: cluster the main nodes of g with their titles and their reference titles
using GSDMM.
input:
simType: the type of similarity, 0 for simple induced similarity, 1 for normalized similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for normalized jaccard similarity numTopics: number of topics alpha, beta: the parameters of GSDMM numIters: number of iterations
output:
for each main node, gives the likelihoods of the node belonging to each cluster.
func (*CitationGraph) ClusterTitlesByWPDM ¶
func (g *CitationGraph) ClusterTitlesByWPDM(eps float64, minPts uint, simType int, workSpaceFileName string) []map[uint]bool
================================================================================================= func (g *CitationGraph) ClusterTitlesByWPDM brief description: cluster the main nodes of g with their titles and their reference titles
using WPDM.
input:
eps: the radius of neighborhood. minPts: Only if the neighborhood of a point contains at least minPts points (the center point of the neighborhood included), the neighborhood is called dense. Only dense neighborhoods are connected to communities. simType: the type of similarity, 0 for simple induced similarity, 1 for normalized similarity, 2 for jaccard similarity, 3 for weighted jaccard similarity, 4 for normalized jaccard similarity workSpaceFileName: a file name for intermediate result
output:
the communities found, each given as a set of members (presumably the indices of main nodes).
func (*CitationGraph) CompareByAMI ¶
func (g *CitationGraph) CompareByAMI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByAMI
func (*CitationGraph) CompareByARI ¶
func (g *CitationGraph) CompareByARI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByARI
func (*CitationGraph) CompareByCPM ¶
func (g *CitationGraph) CompareByCPM(gamma float64, communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByCPM
func (*CitationGraph) CompareByMI ¶
func (g *CitationGraph) CompareByMI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByMI
func (*CitationGraph) CompareByModularity ¶
func (g *CitationGraph) CompareByModularity(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByModularity
func (*CitationGraph) CompareByNMI ¶
func (g *CitationGraph) CompareByNMI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByNMI
func (*CitationGraph) CompareByRI ¶
func (g *CitationGraph) CompareByRI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) CompareByRI
func (*CitationGraph) ComputeEMI ¶
func (g *CitationGraph) ComputeEMI(communities []map[uint]bool, memberships map[int64][]float64) float64
================================================================================================= func (g *CitationGraph) ComputeEMI
func (*CitationGraph) ComputeEntropies ¶
func (g *CitationGraph) ComputeEntropies(communities []map[uint]bool, memberships map[int64][]float64) (float64, float64, float64)
================================================================================================= func (g *CitationGraph) ComputeEntropies
func (*CitationGraph) CreateCorpus ¶
func (g *CitationGraph) CreateCorpus(corpusType int) *Corpus
================================================================================================= func (g *CitationGraph) CreateCorpus brief description: create a corpus from a citation graph input:
corpusType: 0 for title + ref titles per document for main nodes, 1 for title per document for main nodes, 2 for title per document for all nodes, 3 for labels per document for main nodes.
func (*CitationGraph) CreateCorpusSeq ¶
func (g *CitationGraph) CreateCorpusSeq(corpusType int) (corpus *CorpusSeq, years []int, nodeIDs []int64, isEnglish []bool)
================================================================================================= func (g *CitationGraph) CreateCorpusSeq brief description: create a corpus from a citation graph input:
corpusType: 0 for title + ref titles per document for main nodes, 1 for title per document for main nodes, 2 for title per document for all nodes, 3 for labels per document for main nodes.
func (*CitationGraph) CreateCorpusX ¶
func (g *CitationGraph) CreateCorpusX(corpusType int) *CorpusX
================================================================================================= func (g *CitationGraph) CreateCorpusX brief description: create a corpusX from a citation graph input:
corpusType: 0 for title + ref titles per document for main nodes, 1 for title per document for main nodes, 2 for title per document for all nodes.
func (*CitationGraph) GetCommunitiesFromMemberships ¶
func (g *CitationGraph) GetCommunitiesFromMemberships(memberships map[int64][]float64) []map[uint]bool
================================================================================================= func (g *CitationGraph) GetCommunitiesFromMemberships
func (*CitationGraph) GetEmergingTopicPublications ¶
func (g *CitationGraph) GetEmergingTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
================================================================================================= func (g *CitationGraph) GetEmergingTopicPublications note: The publications of emerging topic have the following characteristics:
(1) cold start: in the year of publication, it receives citations <= lowThreshold per year (2) break out: in recent years, it receives avg citations >= highThreshold per year
func (*CitationGraph) GetEmergingTrends ¶
func (g *CitationGraph) GetEmergingTrends(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[string][]int
================================================================================================= func (g *CitationGraph) GetEmergingTrends
func (*CitationGraph) GetHotTopicPublications ¶
func (g *CitationGraph) GetHotTopicPublications(yearToday, yearRecent, yearFarAway, lowThreshold, highThreshold int) map[int64][]int
================================================================================================= func (g *CitationGraph) GetHotTopicPublications note: The publications of hot topic have the following characteristics:
hot start: in the year of publication or next year, it receives citations >= highThreshold per year
func (*CitationGraph) GetIdxMainNode ¶
func (g *CitationGraph) GetIdxMainNode(nodeID int64) uint
================================================================================================= func (g *CitationGraph) GetIdxMainNode
func (*CitationGraph) GetPhraseSimilarity ¶
func (g *CitationGraph) GetPhraseSimilarity(simType int) map[string]map[string]float64
================================================================================================= method GetPhraseSimilarity brief description: compute the similarities between pairs of phrases and gives them in form of a
sparse matrix using Title Link method
input:
simType: the type of similarity (an integer code)
output:
the sparse matrix of phrase similarities
note:
The title link method can be found in: Bogomolova, A., Ryazanova, M., & Balk, I. (2021). Cluster approach to analysis of publication titles. In Journal of Physics: Conference Series (Vol. 1727, No. 1, p. 012016). IOP Publishing.
func (*CitationGraph) GetPhraseSimilarityX ¶
func (g *CitationGraph) GetPhraseSimilarityX(simType int) map[string]map[string]float64
================================================================================================= method GetPhraseSimilarityX brief description: compute the similarities between pairs of phrases and gives them in form of a
sparse matrix using Title Link method
input:
simType: the type of similarity (an integer code)
output:
the sparse matrix of phrase similarities
note:
The title link method can be found in: Bogomolova, A., Ryazanova, M., & Balk, I. (2021). Cluster approach to analysis of publication titles. In Journal of Physics: Conference Series (Vol. 1727, No. 1, p. 012016). IOP Publishing.
func (*CitationGraph) GetStronglyConnectedPhrases ¶
func (g *CitationGraph) GetStronglyConnectedPhrases(thresFreq, thresRatio float64) map[PhrasePair]PairFreq
================================================================================================= func (g *CitationGraph) GetStronglyConnectedPhrases
func (*CitationGraph) Leap2Trend ¶
func (g *CitationGraph) Leap2Trend(fileNamePrefix string, yearStartFrom, yearEndWith, minFreq, minJump int)
func (*CitationGraph) SaveWord2VecTrainingData ¶
func (g *CitationGraph) SaveWord2VecTrainingData(fileNamePrefix string, numIters, minFreq int, minScore float64, useW2PEx bool, yearStartFrom int)
================================================================================================= func (g *CitationGraph) SaveWord2VecTrainingData brief description:
save tokenized and stemmed publication titles to a file for training of word2vec
func (*CitationGraph) SimTFIDF ¶
func (g *CitationGraph) SimTFIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
================================================================================================= method SimTFIDF brief description: compute the Fuzzy TFIDF of possible key phrases for each main node of a citationGraph input:
phraseSimilarity: similarities of phrases
output:
the result of Fuzzy TFIDF grouped by the main nodes of the CitationGraph
func (*CitationGraph) SimTFSimIDF ¶
func (g *CitationGraph) SimTFSimIDF(phraseSimilarity map[string]map[string]float64) []map[string]float64
================================================================================================= method SimTFSimIDF brief description: compute the Fuzzy TFIDF of possible key phrases for each main node of a citationGraph input:
phraseSimilarity: similarities of phrases
output:
the result of Fuzzy TFIDF grouped by the main nodes of the CitationGraph
func (*CitationGraph) SortByYear ¶
func (g *CitationGraph) SortByYear() map[int][]string
================================================================================================= func (g *CitationGraph) SortByYear brief description: return the titles grouped by year input:
None
output:
A map with year as key and slice of titles as value.
func (*CitationGraph) TFIDF ¶
func (g *CitationGraph) TFIDF() []map[string]float64
================================================================================================= method TFIDF brief description: compute the TFIDF of possible key phrases for each main node of a citationGraph input:
nothing
output:
the result of TFIDF grouped by the main nodes of the CitationGraph
type CitationNode ¶
type CitationNode struct { ID int64 Year int64 Title string Labels []string Refs []int64 Cites []int64 }
================================================================================================= struct CitationNode brief description: The node structure of a citation graph, the data of which are collected
from crawling the website https://academic.microsoft.com , which is the public service website of MAG(Microsoft Academic Graph)
fields:
ID: The MAG (Microsoft Academic Graph) id of the paper at this node. We can access the detail of the paper with the id by navigating the web link: https://academic.microsoft.com/paper/$id Year: The year of publication of this paper. Title: The title of the paper. Labels: The labels from MAG (Microsoft Academic Graph). Refs: The references of this paper collected from MAG. Please note that this field could be inaccurate: MAG suffers from delayed information, and many new papers with references don't have them listed in MAG. Cites: The citations to this paper from other papers (also collected from MAG). Please note that this field is as inaccurate as Refs for the same reason.
type Corpus ¶
type Corpus struct { Vocab map[string]int // the vocabulary Docs []map[int]int // keys: docID, wordID, value: word count }
================================================================================================= struct Corpus brief description: the corpus data structure
func NewCorpus ¶
func NewCorpus() *Corpus
================================================================================================= func NewCorpus brief description: create an empty corpus
func (*Corpus) AddDoc ¶
================================================================================================= func (this *Corpus) AddDoc brief description: add one document to corpus with specified docId and word count list, if the
specified docId already exists in corpus, the old doc will be overwritten
type CorpusSeq ¶
type CorpusSeq struct { Vocab map[string]int // the vocabulary Docs [][]int // keys: docID, wordIdx, value: word id }
================================================================================================= struct CorpusSeq brief description: the extended corpus data structure
func NewCorpusSeq ¶
func NewCorpusSeq() *CorpusSeq
================================================================================================= func NewCorpusSeq brief description: create an empty corpus
func (*CorpusSeq) AddDoc ¶
================================================================================================= func (this *CorpusSeq) AddDoc brief description: add one document to corpus with specified docId and word count list, if the
specified docId already exists in corpus, the old doc will be overwritten
func (*CorpusSeq) GetConcurrences ¶
================================================================================================= func (this *CorpusSeq) GetConcurrences brief description: get concurrences from corpus
func (*CorpusSeq) Word2Phrase ¶
================================================================================================= func (this *CorpusSeq) Word2Phrase
type CorpusX ¶
type CorpusX struct { Vocab map[string]int // the vocabulary Docs [][]map[int]int // keys: docID, wordID, value: word count }
================================================================================================= struct CorpusX brief description: the extended corpus data structure
func NewCorpusX ¶
func NewCorpusX() *CorpusX
================================================================================================= func NewCorpusX brief description: create an empty corpus
func (*CorpusX) AddDoc ¶
================================================================================================= func (this *CorpusX) AddDoc brief description: add one document to corpus with specified docId and word count list, if the
specified docId already exists in corpus, the old doc will be overwritten
func (*CorpusX) GetConcurrences ¶
================================================================================================= func (this *CorpusX) GetConcurrences brief description: get concurrences from corpus
func (*CorpusX) GetDocConcurrences ¶
================================================================================================= func (this *CorpusX) GetDocConcurrences brief description: get concurrences from corpus
type DocWord ¶
================================================================================================= struct DocWord
type GSDMM ¶
type GSDMM struct { Alpha float64 // document topic mixture hyperparameter Beta float64 // topic word mixture hyperparameter NumTopics int // number of topics Data *Corpus // the input corpus NumWordsInDoc []int // the number of words in documents DocTopic []int // doc-topic count table TopicWordCount [][]int // word-topic count table TopicWordCountSum []int // word-topic-sum count table TopicDocCount []int // topic-doc-sum count table }
================================================================================================= struct GSDMM brief description: the data structure of GSDMM model
func NewGSDMM ¶
================================================================================================= func NewGSDMM brief description: create a GSDMM instance with collapsed gibbs sampler
func (*GSDMM) ComputeEntropy ¶
================================================================================================= func (this *GSDMM) ComputeEntropy brief description: compute entropy
func (*GSDMM) ComputeRelativeEntropy ¶
================================================================================================= func (this *GSDMM) ComputeRelativeEntropy brief description: compute relative entropy
func (*GSDMM) Infer ¶
================================================================================================= func (this *GSDMM) Infer brief description: infer topics on new documents
func (*GSDMM) Init ¶
func (this *GSDMM) Init()
================================================================================================= func (this *GSDMM) Init
func (*GSDMM) ResampleTopics ¶
================================================================================================= func (this *GSDMM) ResampleTopics
type LDA ¶
type LDA struct { Alpha float64 // document topic mixture hyperparameter Beta float64 // topic word mixture hyperparameter NumTopics int // number of topics Data *Corpus // the input corpus WordTopicCount [][]int // word-topic count table DocTopicCount [][]int // doc-topic count table TopicCountSum []int // word-topic-sum count table DocWordToTopic map[DocWord][]int // doc-word-topic count table }
================================================================================================= struct LDA brief description: the data structure of LDA model with Collapsed Gibbs Sampler note:
The fast collapsed gibbs sampler algorithm can be found in reference: Porteous, I., Newman, D., Ihler, A., Asuncion, A., Smyth, P., & Welling, M. (2008, August). Fast collapsed gibbs sampling for latent dirichlet allocation. In Proceedings of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 569-577).
func NewLDA ¶
================================================================================================= func NewLDA brief description: create an LDA instance with collapsed gibbs sampler
func (*LDA) ComputeEntropy ¶
================================================================================================= func (this *LDA) ComputeEntropy brief description: compute entropy
func (*LDA) ComputeRelativeEntropy ¶
================================================================================================= func (this *LDA) ComputeRelativeEntropy brief description: compute relative entropy
func (*LDA) Infer ¶
================================================================================================= func (this *LDA) Infer brief description: infer topics on new documents
func (*LDA) Init ¶
func (this *LDA) Init()
================================================================================================= func (this *LDA) Init
func (*LDA) ResampleTopics ¶
================================================================================================= func (this *LDA) ResampleTopics
type PhrasePair ¶
type PhrasePair struct {
Phrase1, Phrase2 string
}
================================================================================================= struct PhrasePair
type TopicModel ¶
type TopicModel interface { // train model for iter iteration Train(numIters int) // do inference for new doc with its wordCounts Infer(doc map[int]int) []float64 // compute entropy ComputeEntropy() float64 // compute relative entropy ComputeRelativeEntropy() float64 }
================================================================================================= interface TopicModel brief description: the common interface of topic models
type WPDM ¶
type WPDM struct { Data *Corpus // the input corpus DocTopic []int // doc-topic table NumTopics int // number of topics // contains filtered or unexported fields }
================================================================================================= struct WPDM brief description: the data structure of WPDM model