tokenizer

package

v0.4.0

Published: Jan 20, 2022 License: Apache-2.0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func NewCharacterTokenizerWithOptions

func NewCharacterTokenizerWithOptions(opts map[string]interface{}) (*tokenizer.CharacterTokenizer, error)

Creates a new CharacterTokenizer with the given options. Options example:

{
  "rune": "graphic"
}
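
For illustration, a minimal sketch of constructing and using this tokenizer. The import path github.com/mosuka/phalanx/analysis/tokenizer and the sample text are assumptions, not part of this documentation; Tokenize and the Term/Start/End token fields come from the Bluge analysis types that the signatures above reference.

package main

import (
	"fmt"
	"log"

	// Assumed import path; adjust to the actual module path.
	"github.com/mosuka/phalanx/analysis/tokenizer"
)

func main() {
	// "rune" selects the rune class that counts as token content;
	// "graphic" tokenizes runs of graphic (printable) runes.
	opts := map[string]interface{}{
		"rune": "graphic",
	}

	t, err := tokenizer.NewCharacterTokenizerWithOptions(opts)
	if err != nil {
		log.Fatal(err)
	}

	// Tokenize is the Bluge analysis.Tokenizer method.
	for _, token := range t.Tokenize([]byte("Hello, 世界!")) {
		fmt.Printf("%s [%d:%d]\n", token.Term, token.Start, token.End)
	}
}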

func NewExceptionsTokenizerWithOptions

func NewExceptionsTokenizerWithOptions(opts map[string]interface{}) (*tokenizer.ExceptionsTokenizer, error)

Creates a new ExceptionsTokenizer with the given options; text spans that match one of the patterns are kept together as single tokens. Options example:

{
  "patterns": [
    "[hH][tT][tT][pP][sS]?://(\S)*",
    "[fF][iI][lL][eE]://(\S)*",
    "[fF][tT][pP]://(\S)*",
    "\S+@\S+"
  ]
}
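
A sketch under the same assumed import path. Since options usually arrive via JSON, "patterns" is passed here as a []interface{} of strings; note that in Go source each \S needs a doubled backslash.

package main

import (
	"fmt"
	"log"

	"github.com/mosuka/phalanx/analysis/tokenizer" // assumed import path
)

func main() {
	// URL-like and email-like spans are kept intact as single tokens.
	opts := map[string]interface{}{
		"patterns": []interface{}{
			"[hH][tT][tT][pP][sS]?://(\\S)*",
			"\\S+@\\S+",
		},
	}

	t, err := tokenizer.NewExceptionsTokenizerWithOptions(opts)
	if err != nil {
		log.Fatal(err)
	}

	for _, token := range t.Tokenize([]byte("mail foo@example.com or visit https://example.com")) {
		fmt.Println(string(token.Term))
	}
}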

func NewKagomeTokenizerWithOptions

func NewKagomeTokenizerWithOptions(opts map[string]interface{}) (analysis.Tokenizer, error)

Creates a new KagomeTokenizer with the given options. The stop_tags and base_forms values are IPA-dictionary part-of-speech tags. Options example:

{
  "dictionary": "IPADIC",
  "stop_tags": [
    "接続詞",
    "助詞",
    "助詞-格助詞",
    "助詞-格助詞-一般",
    "助詞-格助詞-引用",
    "助詞-格助詞-連語",
    "助詞-接続助詞",
    "助詞-係助詞",
    "助詞-副助詞",
    "助詞-間投助詞",
    "助詞-並立助詞",
    "助詞-終助詞",
    "助詞-副助詞/並立助詞/終助詞",
    "助詞-連体化",
    "助詞-副詞化",
    "助詞-特殊",
    "助動詞",
    "記号",
    "記号-一般",
    "記号-読点",
    "記号-句点",
    "記号-空白",
    "記号-括弧開",
    "記号-括弧閉",
    "その他-間投",
    "フィラー",
    "非言語音"
  ],
  "base_forms": [
    "動詞",
    "形容詞",
    "形容動詞"
  ]
}
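
A sketch under the same assumed import path, with the tag lists trimmed for brevity; the sample sentence is illustrative only.

package main

import (
	"fmt"
	"log"

	"github.com/mosuka/phalanx/analysis/tokenizer" // assumed import path
)

func main() {
	opts := map[string]interface{}{
		"dictionary": "IPADIC",
		// Trimmed-down tag lists for illustration; see the full example above.
		"stop_tags":  []interface{}{"助詞", "記号"},
		"base_forms": []interface{}{"動詞", "形容詞"},
	}

	t, err := tokenizer.NewKagomeTokenizerWithOptions(opts)
	if err != nil {
		log.Fatal(err)
	}

	for _, token := range t.Tokenize([]byte("関西国際空港に行きました")) {
		fmt.Println(string(token.Term))
	}
}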

func NewRegexpTokenizerWithOptions

func NewRegexpTokenizerWithOptions(opts map[string]interface{}) (*tokenizer.RegexpTokenizer, error)

Creates a new RegexpTokenizer with the given options. Options example:

{
  "pattern": "[0-9a-zA-Z_]*"
}
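
A sketch under the same assumed import path; each match of the pattern is emitted as a token.

package main

import (
	"fmt"
	"log"

	"github.com/mosuka/phalanx/analysis/tokenizer" // assumed import path
)

func main() {
	opts := map[string]interface{}{
		"pattern": "[0-9a-zA-Z_]*",
	}

	t, err := tokenizer.NewRegexpTokenizerWithOptions(opts)
	if err != nil {
		log.Fatal(err)
	}

	// Each non-empty match of the pattern becomes one token.
	for _, token := range t.Tokenize([]byte("foo_bar baz42")) {
		fmt.Println(string(token.Term))
	}
}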

Types

type Tokenizer

type Tokenizer string
const (
	CharacterTokenizer   Tokenizer = "character"
	ExceptionTokenizer   Tokenizer = "exception"
	KagomeTokenizer      Tokenizer = "kagome"
	LetterTokenizer      Tokenizer = "letter"
	RegexpTokenizer      Tokenizer = "regexp"
	SingleTokenTokenizer Tokenizer = "single_token"
	UnicodeTokenizer     Tokenizer = "unicode"
	WebTokenizer         Tokenizer = "web"
	WhitespaceTokenizer  Tokenizer = "whitespace"
)

type TokenizerSetting

type TokenizerSetting struct {
	Name    Tokenizer              `json:"name"`
	Options map[string]interface{} `json:"options"`
}
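
TokenizerSetting pairs a tokenizer name with its options map, which makes it a natural target for JSON configuration. A sketch of decoding a setting and dispatching on Name; the switch shows only two of the constructors above (the rest follow the same shape), the analysis import is Bluge's, and the tokenizer import path is again an assumption.

package main

import (
	"encoding/json"
	"fmt"
	"log"

	"github.com/blugelabs/bluge/analysis"
	"github.com/mosuka/phalanx/analysis/tokenizer" // assumed import path
)

func main() {
	data := []byte(`{"name": "regexp", "options": {"pattern": "[0-9a-zA-Z_]*"}}`)

	var setting tokenizer.TokenizerSetting
	if err := json.Unmarshal(data, &setting); err != nil {
		log.Fatal(err)
	}

	var t analysis.Tokenizer
	var err error
	switch setting.Name {
	case tokenizer.RegexpTokenizer:
		t, err = tokenizer.NewRegexpTokenizerWithOptions(setting.Options)
	case tokenizer.KagomeTokenizer:
		t, err = tokenizer.NewKagomeTokenizerWithOptions(setting.Options)
	default:
		log.Fatalf("unsupported tokenizer: %s", setting.Name)
	}
	if err != nil {
		log.Fatal(err)
	}

	for _, token := range t.Tokenize([]byte("hello_world 123")) {
		fmt.Println(string(token.Term))
	}
}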
