go_segment/

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/gyuho/learn

Links

Open Source Insights

README ¶

back to contents

Go: segment

Reference
Segment
Segment algorithm

↑ top

Reference

↑ top

Segment

Wikipedia explains a segment is "any discrete unit that can be identified, either physically or auditorily, in the stream of speech". The goal of segmentation is: Ilovecomputerscience. → I love computer science.

↑ top

Segment algorithm

This algorithm is based on http://norvig.com/ngrams/ch14.pdf:

Read the corpus data.
Count the frequency of each word.
Do smoothing over the words not in the corpus.
Consider every possible way to split the text.
Return the segmentation with the highest probability.

We can split a text into a first word and a remaining text. For each possible split, find the best way to segment the remainder. And return the one with the highest product of P(first) × P(remaining).

package main

import (
	"bufio"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"strings"
)

func open(filename string) *os.File {
	f, err := os.Open(filename)
	if err != nil {
		log.Fatalf("Unable to read file: %+v", err)
	}
	return f
}

func main() {
	pFunc := Probability(open("sample.txt"))
	rs := Get("IamCodingwithGO.Ireallyenjoythislanguage.", pFunc)
	fmt.Println(rs)
	// [IamCodingwithGO.I really enjoy this language .]
}

// probability calculates frequency probability.
// For each word, divide frequency by the number of words.
func probability(reader io.Reader) map[string]float64 {
	scanner := bufio.NewScanner(reader)
	//
	// This must be called before Scan.
	// The default split function is bufio.ScanLines.
	scanner.Split(bufio.ScanWords)
	//
	pmap := make(map[string]float64)
	//
	var length float64
	for scanner.Scan() {
		// Remove all leading and trailing Unicode code points.
		word := strings.Trim(scanner.Text(), ",-!;:\"?.")
		if _, exist := pmap[word]; exist {
			pmap[word]++
		} else {
			pmap[word] = 1
		}
		// keep increasing while reading(scanning)
		length++
	}
	for k, v := range pmap {
		pmap[k] = v / length
	}
	return pmap
}

// Probability returns the word probability,
// with smoothing.
func Probability(reader io.Reader) func(string) float64 {
	pmap := probability(reader)
	return func(word string) float64 {
		if score, ok := pmap[word]; ok {
			return score
		}
		// if the word has never showed up, smooth.
		return 10 / (float64(len(pmap)) * math.Pow(10, float64(len(word))))
	}
}

type split struct {
	Head string
	Tail string
}

func doSplit(txt string) []split {
	splits := []split{}
	for i := range txt {
		splits = append(splits, split{txt[:i+1], txt[i+1:]})
	}
	return splits
}

func mostPlausible(chunks [][]string, probFunc func(string) float64) []string {
	chunk := []string{}
	min := -1 * math.MaxFloat64
	for _, words := range chunks {
		score := 1.0
		for _, elem := range words {
			score *= probFunc(elem)
		}
		if min < score {
			min = score
			chunk = words
		}
	}
	return chunk
}

// prev stores previously found segmentations.
var prev = map[string][]string{}

// Get returns the highest-scoring segmentation.
func Get(txt string, probFunc func(string) float64) []string {
	if len(txt) == 0 {
		return []string{}
	}
	if result, ok := prev[txt]; ok {
		return result
	}
	chunks := [][]string{}
	for _, split := range doSplit(txt) {
		chunks = append(chunks,
			append([]string{split.Head},
				Get(split.Tail, probFunc)...,
			),
		)
	}
	rs := mostPlausible(chunks, probFunc)
	prev[txt] = rs
	return rs
}

↑ top

Directories ¶

Path	Synopsis
code

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL