documents

package
v0.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 5, 2024 License: Apache-2.0 Imports: 14 Imported by: 0

Documentation

Index

Constants

View Source
// Namespace identifiers. The first sixteen constants are typed Namespace and take the consecutive values
// 0 (NamespaceArticle) through 15 (NamespaceCategoryTalk) via iota; the remaining constants are assigned
// explicit values.
// NOTE(review): the numbering looks like it mirrors MediaWiki's built-in namespace numbers — confirm.
const (
	// NamespaceArticle applies to pages containing written content of Wikipedia: the text itself and what people
	// generally mean when they say "Wikipedia".
	NamespaceArticle Namespace = iota
	// NamespaceTalk pages have a corresponding Article, and are where editors discuss the content of the Article.
	NamespaceTalk
	NamespaceUser
	NamespaceUserTalk
	// NamespaceWikipedia pages are the procedural guidelines of how Wikipedia is run and maintained, such as style
	// guides.
	NamespaceWikipedia
	// NamespaceWikipediaTalk pages are where editors discuss how Wikipedia is run and propose changes.
	NamespaceWikipediaTalk
	NamespaceFile
	NamespaceFileTalk
	NamespaceMediaWiki
	NamespaceMediaWikiTalk
	NamespaceTemplate
	NamespaceTemplateTalk
	NamespaceHelp
	NamespaceHelpTalk
	NamespaceCategory
	NamespaceCategoryTalk
	// NOTE(review): the constants from here on are written as `Name = value` with no type, so under Go's
	// constant-declaration rules they are untyped integer constants, not Namespace values like the iota group
	// above. They still convert implicitly when assigned to or compared with a Namespace; confirm whether typing
	// them as Namespace was intended.
	NamespacePortal        = 100
	NamespacePortalTalk    = 101
	NamespaceDraft         = 118
	NamespaceDraftTalk     = 119
	NamespaceTimedText     = 710
	NamespaceTimedTextTalk = 711
	NamespaceModule        = 828
	NamespaceModuleTalk    = 829
	// The two identifiers below are the only negative values in this block.
	NamespaceMedia         = -2
	NamespaceSpecial       = -1
)

Variables

View Source
// File_pkg_documents_documents_proto is a protoreflect.FileDescriptor.
// NOTE(review): the name suggests it is the generated descriptor for pkg/documents/documents.proto — confirm.
var File_pkg_documents_documents_proto protoreflect.FileDescriptor
View Source
// Missed is an exported, mutable package-level int that starts at 0.
// NOTE(review): what it counts and which code updates it are not visible here — document its meaning, and
// confirm whether it is ever touched from more than one goroutine.
var Missed = 0

Functions

func DisambiguateTags

func DisambiguateTags(page, category string) string

func ReadPages

func ReadPages(pages chan<- *Page) func([]byte) error

func WriteFrequencyTable

func WriteFrequencyTable(out string, t FrequencyTable) error

Types

type Categories

// Categories is a message wrapping a list of uint32 values.
// NOTE(review): the values are presumably category page IDs — confirm against Categorizer and PageCategories.
type Categories struct {
	// Categories is protobuf field 1 (repeated, packed varint); in JSON it is named "categories" and is omitted
	// when empty.
	Categories []uint32 `protobuf:"varint,1,rep,packed,name=categories,proto3" json:"categories,omitempty"`
	// contains filtered or unexported fields
}

func (*Categories) Add

func (x *Categories) Add(parent uint32)

func (*Categories) Descriptor deprecated

func (*Categories) Descriptor() ([]byte, []int)

Deprecated: Use Categories.ProtoReflect.Descriptor instead.

func (*Categories) GetCategories

func (x *Categories) GetCategories() []uint32

func (*Categories) ProtoMessage

func (*Categories) ProtoMessage()

func (*Categories) ProtoReflect

func (x *Categories) ProtoReflect() protoreflect.Message

func (*Categories) Reset

func (x *Categories) Reset()

func (*Categories) String

func (x *Categories) String() string

type Categorizer

// Categorizer carries the TitleIndex that is available to its Categorize method.
// NOTE(review): presumably Categorize uses the index to resolve category titles to IDs — confirm.
type Categorizer struct {
	// TitleIndex is held by pointer, so a zero-value Categorizer has a nil index.
	TitleIndex *TitleIndex
}

func (*Categorizer) Categorize

func (c *Categorizer) Categorize(page *Page) *Categories

type Document

// Document is a message holding a list of Page pointers.
type Document struct {
	// Pages is protobuf field 1 (repeated, length-delimited); in JSON it is named "pages" and is omitted when
	// empty.
	Pages []*Page `protobuf:"bytes,1,rep,name=pages,proto3" json:"pages,omitempty"`
	// contains filtered or unexported fields
}

func (*Document) Descriptor deprecated

func (*Document) Descriptor() ([]byte, []int)

Deprecated: Use Document.ProtoReflect.Descriptor instead.

func (*Document) GetPages

func (x *Document) GetPages() []*Page

func (*Document) ProtoMessage

func (*Document) ProtoMessage()

func (*Document) ProtoReflect

func (x *Document) ProtoReflect() protoreflect.Message

func (*Document) Reset

func (x *Document) Reset()

func (*Document) String

func (x *Document) String() string

type Frequency

// Frequency pairs a word with an integer count. It is the element type of FrequencyTable.Frequencies.
type Frequency struct {
	// Word is the string being counted.
	Word  string
	// Count is the tally associated with Word.
	// NOTE(review): presumably the number of occurrences seen — confirm.
	Count int
}

type FrequencyMap

// FrequencyMap wraps a map from string to int count.
type FrequencyMap struct {
	// Counts maps a string to its count.
	// NOTE(review): a zero-value FrequencyMap has a nil Counts map, and writing to a nil map panics; whether
	// Collect/CollectMaps initialize it is not visible here — confirm.
	Counts map[string]int
}

func (*FrequencyMap) Collect

func (f *FrequencyMap) Collect(words <-chan string)

Collect reads the words in a channel into a frequency table.

func (*FrequencyMap) CollectMaps

func (f *FrequencyMap) CollectMaps(
	wordCountChannel <-chan map[string]int,
	countFilter,
	sizeThreshold int,
) *sync.WaitGroup

func (*FrequencyMap) Filter

func (f *FrequencyMap) Filter(minCount int)

Filter drops all words which have been seen fewer than minCount times.

type FrequencyTable

// FrequencyTable is a list of Frequency entries.
type FrequencyTable struct {
	// Frequencies holds the entries by value.
	// NOTE(review): any ordering guarantee (e.g. sorted by count) is not visible here — confirm.
	Frequencies []Frequency
}

func ReadFrequencyTables

func ReadFrequencyTables(paths ...string) (*FrequencyTable, error)

func ToFrequencyTable

func ToFrequencyTable(wordCounts map[string]int) FrequencyTable

func (*FrequencyTable) ToNgramDictionary

func (t *FrequencyTable) ToNgramDictionary() map[string]bool

type Namespace

type Namespace int16

A Namespace is a page property that refers to its function within Wikipedia. Pages with the same Namespace have the same structure and function, and can usually be processed together.

type Page

// Page is a message with a numeric id, a title and a text body. All three fields are omitted from JSON when
// they hold their zero value.
type Page struct {
	// Id is protobuf field 1 (varint), JSON name "id". The name departs from Go's "ID" initialism convention,
	// but it is exported, so renaming it would change the API.
	Id    uint32 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"`
	// Title is protobuf field 2, JSON name "title".
	Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"`
	// Text is protobuf field 3, JSON name "text".
	Text  string `protobuf:"bytes,3,opt,name=text,proto3" json:"text,omitempty"`
	// contains filtered or unexported fields
}

func (*Page) Descriptor deprecated

func (*Page) Descriptor() ([]byte, []int)

Deprecated: Use Page.ProtoReflect.Descriptor instead.

func (*Page) GetID

func (x *Page) GetID() uint32

func (*Page) GetText

func (x *Page) GetText() string

func (*Page) GetTitle

func (x *Page) GetTitle() string

func (*Page) ID

func (x *Page) ID() uint32

func (*Page) ProtoMessage

func (*Page) ProtoMessage()

func (*Page) ProtoReflect

func (x *Page) ProtoReflect() protoreflect.Message

func (*Page) Reset

func (x *Page) Reset()

func (*Page) String

func (x *Page) String() string

type PageCategories

// PageCategories is a message mapping uint32 keys to Categories messages.
type PageCategories struct {
	// Pages maps a uint32 key to a *Categories value.
	// NOTE(review): presumably the key is a page ID and the value lists that page's parent categories — confirm.
	Pages map[uint32]*Categories `` /* 152-byte string literal not displayed */
	// contains filtered or unexported fields
}

func (*PageCategories) Add

func (x *PageCategories) Add(child, parent uint32)

func (*PageCategories) Descriptor deprecated

func (*PageCategories) Descriptor() ([]byte, []int)

Deprecated: Use PageCategories.ProtoReflect.Descriptor instead.

func (*PageCategories) GetPages

func (x *PageCategories) GetPages() map[uint32]*Categories

func (*PageCategories) ProtoMessage

func (*PageCategories) ProtoMessage()

func (*PageCategories) ProtoReflect

func (x *PageCategories) ProtoReflect() protoreflect.Message

func (*PageCategories) Reset

func (x *PageCategories) Reset()

func (*PageCategories) String

func (x *PageCategories) String() string

type TitleIndex

// TitleIndex is a message mapping strings to uint32 values.
type TitleIndex struct {
	// Titles maps a string key to a uint32.
	// NOTE(review): presumably page title -> page ID — confirm.
	Titles map[string]uint32 `` /* 154-byte string literal not displayed */
	// contains filtered or unexported fields
}

func (*TitleIndex) Descriptor deprecated

func (*TitleIndex) Descriptor() ([]byte, []int)

Deprecated: Use TitleIndex.ProtoReflect.Descriptor instead.

func (*TitleIndex) GetTitles

func (x *TitleIndex) GetTitles() map[string]uint32

func (*TitleIndex) ProtoMessage

func (*TitleIndex) ProtoMessage()

func (*TitleIndex) ProtoReflect

func (x *TitleIndex) ProtoReflect() protoreflect.Message

func (*TitleIndex) Reset

func (x *TitleIndex) Reset()

func (*TitleIndex) String

func (x *TitleIndex) String() string

type WordSet

// WordSet records the top words of a single article.
type WordSet struct {
	// ID is the article ID
	// NOTE(review): this is an int, while Page.Id is a uint32 — confirm the two are meant to hold the same IDs.
	ID int
	// Words is the sorted list of top words in the document.
	// NOTE(review): entries are uint16 values, so they are presumably indices into a word dictionary rather than
	// the words themselves — confirm.
	Words []uint16
}

type WordSets

// WordSets groups a list of WordSet values together with an input-file string.
type WordSets struct {
	// InFile is named "in_file" in JSON and is omitted when empty.
	// NOTE(review): presumably the path of the file the documents were read from — confirm.
	InFile    string `json:"in_file,omitempty"`
	// Documents has no struct tag, so encoding/json uses the field name "Documents" as its key.
	Documents []WordSet
}

type XMLDocument

// XMLDocument is the XML-side container for pages.
type XMLDocument struct {
	// Pages is tagged to map to the "page" child elements of the document.
	Pages []XMLPage `xml:"page"`
}

XMLDocument exists solely for extracting pages from XML; call ToProto to convert it to a Document.

func (*XMLDocument) ToProto

func (d *XMLDocument) ToProto() *Document

type XMLPage

// XMLPage is the XML-side representation of one page. Each field is tagged to map to the child element of the
// same lowercase name.
type XMLPage struct {
	// Title maps to the "title" element.
	Title    string      `xml:"title"`
	// NS maps to the "ns" element and is typed as a Namespace.
	NS       Namespace   `xml:"ns"`
	// ID maps to the "id" element.
	ID       uint32      `xml:"id"`
	// Redirect maps to the "redirect" element; it also carries a yaml tag with omitempty.
	Redirect XMLRedirect `yaml:",omitempty" xml:"redirect"`
	// Revision maps to the "revision" element.
	Revision XMLRevision `xml:"revision"`
}

func (*XMLPage) ToProto

func (p *XMLPage) ToProto() *Page

type XMLRedirect

// XMLRedirect is the type of XMLPage.Redirect.
type XMLRedirect struct {
	// Title maps to the "title" attribute (not a child element) of the enclosing element; the yaml tag omits it
	// when empty.
	Title string `yaml:",omitempty" xml:"title,attr"`
}

type XMLRevision

// XMLRevision is the type of XMLPage.Revision.
type XMLRevision struct {
	// Text maps to the "text" child element.
	Text string `xml:"text"`
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL