filter

package
v2.9.7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 20, 2024 License: MIT Imports: 6 Imported by: 8

Documentation

Overview

Package filter prepares the inputs and outputs.

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func Drop

func Drop(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)

Drop removes each token for which the provided match function returns true.

func Keep

func Keep(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)

Keep retains only the tokens for which the provided match function returns true.

func ScanSentences

func ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)

ScanSentences implements the SplitFunc interface of bufio.Scanner, returning each sentence of the text. See https://pkg.go.dev/bufio#SplitFunc

Example
package main

import (
	"bufio"
	"fmt"
	"strings"

	"github.com/ikawaha/kagome/v2/filter"
)

func main() {
	// Sample text: an excerpt from "The Red Candles and the Mermaid" by Ogawa Mimei.
	text := ` 人魚は、南の方の海にばかり棲んでいるのではあ
                         りません。北の海にも棲んでいたのであります。
                          北方の海うみの色は、青うございました。ある
                         とき、岩の上に、女の人魚があがって、あたりの景
                         色をながめながら休んでいました。

                         小川未明作 赤い蝋燭と人魚より`

	// Scan the text sentence by sentence using filter.ScanSentences as the split function.
	sc := bufio.NewScanner(strings.NewReader(text))
	sc.Split(filter.ScanSentences)
	for sc.Scan() {
		fmt.Println(sc.Text())
	}
	if err := sc.Err(); err != nil {
		panic(err)
	}
}
Output:

人魚は、南の方の海にばかり棲んでいるのではありません。
北の海にも棲んでいたのであります。
北方の海うみの色は、青うございました。
あるとき、岩の上に、女の人魚があがって、あたりの景色をながめながら休んでいました。
小川未明作赤い蝋燭と人魚より

Types

type Feature

type Feature = string

Feature represents a feature.

const Any Feature = "\x00"

Any represents an arbitrary feature.

type Features

type Features = []string

Features represents a vector of features.

type FeaturesFilter

type FeaturesFilter struct {
	// contains filtered or unexported fields
}

FeaturesFilter represents a filter that filters a vector of features.

func NewFeaturesFilter

func NewFeaturesFilter(fs ...Features) *FeaturesFilter

NewFeaturesFilter returns a features filter.

func (*FeaturesFilter) Match

func (f *FeaturesFilter) Match(fs Features) bool

Match returns true if a filter matches given features.

func (*FeaturesFilter) String

func (f *FeaturesFilter) String() string

String implements the fmt.Stringer interface.

type POS

type POS = []string

POS represents a part-of-speech that is a vector of features.

type POSFilter

type POSFilter struct {
	// contains filtered or unexported fields
}

POSFilter represents a part-of-speech filter.

Example
package main

import (
	"fmt"

	"github.com/ikawaha/kagome-dict/dict"
	"github.com/ikawaha/kagome/v2/filter"
	"github.com/ikawaha/kagome/v2/tokenizer"
)

// testDictPath points at the IPA dictionary used by the examples.
const testDictPath = "../testdata/ipa.dict"

func main() {
	// Load the dictionary and build a tokenizer that omits BOS/EOS tokens.
	dic, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		panic(err)
	}
	tkn, err := tokenizer.New(dic, tokenizer.OmitBosEos())
	if err != nil {
		panic(err)
	}
	// Filter that matches person names (名詞/*/人名) and adjectives (形容詞).
	pf := filter.NewPOSFilter(
		filter.POS{"名詞", filter.Any, "人名"},
		filter.POS{"形容詞"},
	)
	tokens := tkn.Tokenize("赤い蝋燭と人魚。小川未明")
	pf.Keep(&tokens)
	for _, tok := range tokens {
		fmt.Println(tok.Surface, tok.POS())
	}
}
Output:

赤い [形容詞 自立 * *]
小川 [名詞 固有名詞 人名 姓]
未明 [名詞 固有名詞 人名 名]

func NewPOSFilter

func NewPOSFilter(p ...POS) *POSFilter

NewPOSFilter returns a part-of-speech filter.

func (POSFilter) Drop

func (f POSFilter) Drop(tokens *[]tokenizer.Token)

Drop drops a token if a filter matches token's POS.

func (POSFilter) Keep

func (f POSFilter) Keep(tokens *[]tokenizer.Token)

Keep keeps a token if a filter matches token's POS.

func (POSFilter) Match

func (f POSFilter) Match(p POS) bool

Match returns true if a filter matches given POS.

type SentenceSplitter

type SentenceSplitter struct {
	Delim               []rune // sentence delimiter set, e.g. {'。', '.'}
	Follower            []rune // characters allowed to follow a delimiter, e.g. {'」', '』'}
	SkipWhiteSpace      bool   // whether to eliminate white space
	DoubleLineFeedSplit bool   // whether to split at '\n\n'
	MaxRuneLen          int    // max sentence length in runes
}

SentenceSplitter is a tiny sentence splitter for Japanese texts.

func (SentenceSplitter) ScanSentences

func (s SentenceSplitter) ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)

ScanSentences is a split function for a Scanner that returns each sentence of the text. nolint: gocyclo

type WordFilter

type WordFilter struct {
	// contains filtered or unexported fields
}

WordFilter represents a word filter.

Example
d, err := dict.LoadDictFile(testDictPath)
if err != nil {
	panic(err)
}
t, err := tokenizer.New(d, tokenizer.OmitBosEos())
if err != nil {
	panic(err)
}
stopWords := filter.NewWordFilter([]string{"私", "は", "が", "の", "。"})
tokens := t.Tokenize("私の猫の名前はアプロです。")
stopWords.Drop(&tokens)
for _, v := range tokens {
	fmt.Println(v.Surface)
}
Output:

猫
名前
アプロ
です

func NewWordFilter

func NewWordFilter(words []string) *WordFilter

NewWordFilter returns a word filter.

func (WordFilter) Drop

func (f WordFilter) Drop(tokens *[]tokenizer.Token)

Drop drops a token if a filter matches token's surface.

func (WordFilter) Keep

func (f WordFilter) Keep(tokens *[]tokenizer.Token)

Keep keeps a token if a filter matches token's surface.

func (WordFilter) Match

func (f WordFilter) Match(w string) bool

Match returns true if a filter matches a given word.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL