mapkha

package module
v0.0.0-...-ffa9815 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 20, 2017 License: LGPL-3.0 Imports: 5 Imported by: 0

README

Mapkha

Thai word segmentation (wordcut; word boundary identification; ตัดคำ) program in Go (golang)

Example

package main

import ("fmt"
    "strings"
    "bufio"
    "os"
    m "github.com/veer66/mapkha"
)

// check panics with e when e is a non-nil error; a nil error is a no-op.
func check(e error) {
    if e == nil {
        return
    }
    panic(e)
}

// main loads the default dictionary, then reads standard input line by line,
// segments each line with the wordcut, and prints the resulting words joined
// by "|". Any error from loading the dictionary or reading input panics via
// check.
func main() {
    dict, e := m.LoadDefaultDict()
    check(e)
    wordcut := m.NewWordcut(dict)
    scanner := bufio.NewScanner(os.Stdin)
    for scanner.Scan() {
        fmt.Println(strings.Join(wordcut.Segment(scanner.Text()), "|"))
    }
    // Scan returns false both at EOF and on failure (for example a read error
    // or a line longer than the scanner's buffer); Err is nil only for EOF.
    check(scanner.Err())
}

Documentation

Index

Constants

View Source
// Constants with the values 1 through 5. Etype is also the type of the
// Edge.EdgeType field, so these are presumably the edge kinds — confirm
// against the package source.
//
// NOTE(review): only DICT is declared with the explicit type Etype. UNK, INIT,
// LATIN and SPACE have explicit values but no type in their specs, so under
// Go's constant declaration rules they are untyped integer constants rather
// than Etype values.
const (
	DICT  Etype = 1
	UNK         = 2
	INIT        = 3
	LATIN       = 4
	SPACE       = 5
)

Variables

This section is empty.

Functions

This section is empty.

Types

type AccPool

type AccPool struct {
	// contains filtered or unexported fields
}

AccPool - pool of dict acceptor

func NewAccPool

func NewAccPool() *AccPool

NewAccPool - build acceptor pool

func (*AccPool) Obtain

func (pool *AccPool) Obtain(p int) *DictAcceptor

Obtain - obtain dict acceptor at p

func (*AccPool) Reset

func (pool *AccPool) Reset()

Reset - reset acceptor pool

type Dict

type Dict struct {
	// contains filtered or unexported fields
}

Dict is a prefix tree

func LoadDefaultDict

func LoadDefaultDict() (*Dict, error)

LoadDefaultDict - loads the default Thai dictionary

func LoadDict

func LoadDict(path string) (*Dict, error)

LoadDict loads a word list from the file at the given path

func MakeDict

func MakeDict(words []string) *Dict

func (*Dict) Lookup

func (d *Dict) Lookup(p int, offset int, ch rune) (*PrefixTreePointer, bool)

Lookup - lookup node in a Prefix Tree

type DictAcceptor

type DictAcceptor struct {
	// contains filtered or unexported fields
}

func (*DictAcceptor) Reset

func (a *DictAcceptor) Reset(p int)

Reset - reset internal state

func (*DictAcceptor) Transit

func (a *DictAcceptor) Transit(ch rune, dict *Dict)

Transit - walk on prefix tree by new rune

type DictEdgeBuilder

type DictEdgeBuilder struct {
	// contains filtered or unexported fields
}

func NewDictEdgeBuilder

func NewDictEdgeBuilder(dict *Dict) *DictEdgeBuilder

func (*DictEdgeBuilder) Build

func (builder *DictEdgeBuilder) Build(context *EdgeBuildingContext) *Edge

Build - build new edge from dictionary

func (*DictEdgeBuilder) Reset

func (builder *DictEdgeBuilder) Reset()

type Edge

type Edge struct {
	S         int
	EdgeType  Etype
	WordCount int
	UnkCount  int
}

Edge - edge of word graph

func (*Edge) IsBetterThan

func (edge *Edge) IsBetterThan(another *Edge) bool

IsBetterThan - comparing this edge to another edge

type EdgeBuilder

// EdgeBuilder is the interface for edge builders: Build takes an
// *EdgeBuildingContext and returns an *Edge, and Reset takes no arguments and
// returns nothing. DictEdgeBuilder, PatEdgeBuilder and UnkEdgeBuilder in this
// package each declare both methods on their pointer receivers.
type EdgeBuilder interface {
	Build(*EdgeBuildingContext) *Edge
	Reset()
}

type EdgeBuildingContext

type EdgeBuildingContext struct {
	I            int
	Ch           rune
	Path         []*Edge
	LeftBoundary int
	BestEdge     *Edge
	// contains filtered or unexported fields
}

type Etype

type Etype int

type Index

type Index struct {
	// contains filtered or unexported fields
}

func MakeIndex

func MakeIndex(rwords [][]rune) *Index

func (*Index) Get0

func (idx *Index) Get0(policy Policy, ch rune) (int, bool)

type PatEdgeBuilder

type PatEdgeBuilder struct {
	// contains filtered or unexported fields
}

func (*PatEdgeBuilder) Build

func (builder *PatEdgeBuilder) Build(context *EdgeBuildingContext) *Edge

func (*PatEdgeBuilder) Reset

func (builder *PatEdgeBuilder) Reset()

type Policy

type Policy int
// Policy values; a Policy is the first argument of Index.Get0.
//
// NOTE(review): only LEFT is declared with the explicit type Policy. RIGHT has
// an explicit value but no type in its spec, so it is an untyped integer
// constant rather than a Policy value.
const (
	LEFT  Policy = 1
	RIGHT        = 2
)

type PrefixTree

type PrefixTree struct {
	// contains filtered or unexported fields
}

PrefixTree is a Hash-based Prefix Tree for searching words

func MakePrefixTree

func MakePrefixTree(wordsWithPayload []WordWithPayload) *PrefixTree

MakePrefixTree constructs a prefix tree from a list of words with payloads

func (*PrefixTree) Lookup

func (tree *PrefixTree) Lookup(nodeID int, offset int, ch rune) (*PrefixTreePointer, bool)

Lookup - look up prefix tree from node-id, offset and a character

type PrefixTreeNode

type PrefixTreeNode struct {
	NodeID int
	Offset int
	Ch     rune
}

PrefixTreeNode represents node in a prefix tree

type PrefixTreePointer

type PrefixTreePointer struct {
	ChildID int
	IsFinal bool
	Payload interface{}
}

PrefixTreePointer holds partial information about an edge

type TextRange

type TextRange struct {
	// contains filtered or unexported fields
}

func GraphToRanges

func GraphToRanges(path []Edge) []TextRange

GraphToRanges returns a slice of TextRange for the given edge path. The implementation was improved as Roger Peppe suggested in his tweet https://twitter.com/rogpeppe/status/574911374645682176

type UnkEdgeBuilder

type UnkEdgeBuilder struct {
}

func (*UnkEdgeBuilder) Build

func (builder *UnkEdgeBuilder) Build(context *EdgeBuildingContext) *Edge

Build - builds a dummy edge when no edge has been created.

func (*UnkEdgeBuilder) Reset

func (builder *UnkEdgeBuilder) Reset()

type WordWithPayload

type WordWithPayload struct {
	Word    string
	Payload interface{}
}

WordWithPayload is a pair of word and its payload

type Wordcut

type Wordcut struct {
	// contains filtered or unexported fields
}

func NewWordcut

func NewWordcut(dict *Dict) *Wordcut

func (*Wordcut) Reset

func (w *Wordcut) Reset()

func (*Wordcut) Segment

func (w *Wordcut) Segment(text string) []string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL