sentencepiece

package

Version: v0.0.0-...-e4229d2 (latest) | Published: Oct 9, 2023 | License: MIT | Imports: 14 | Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/TiregeRRR/go-sentencepiece

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
type ModelProto
- func (*ModelProto) Descriptor() ([]byte, []int) (deprecated)
- func (*ModelProto) ExtensionRangeArray() []protoiface.ExtensionRangeV1 (deprecated)
- func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec
- func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec
- func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece
- func (x *ModelProto) GetSelfTestData() *SelfTestData
- func (x *ModelProto) GetTrainerSpec() *TrainerSpec
- func (*ModelProto) ProtoMessage()
- func (x *ModelProto) ProtoReflect() protoreflect.Message
- func (x *ModelProto) Reset()
- func (x *ModelProto) String() string
type ModelProto_SentencePiece
- func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) (deprecated)
- func (*ModelProto_SentencePiece) ExtensionRangeArray() []protoiface.ExtensionRangeV1 (deprecated)
- func (x *ModelProto_SentencePiece) GetPiece() string
- func (x *ModelProto_SentencePiece) GetScore() float32
- func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type
- func (*ModelProto_SentencePiece) ProtoMessage()
- func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message
- func (x *ModelProto_SentencePiece) Reset()
- func (x *ModelProto_SentencePiece) String() string
type ModelProto_SentencePiece_Type
- func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor
- func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type
- func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) (deprecated)
- func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber
- func (x ModelProto_SentencePiece_Type) String() string
- func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType
- func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error (deprecated)
type NormalizerSpec
- func (*NormalizerSpec) Descriptor() ([]byte, []int) (deprecated)
- func (*NormalizerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 (deprecated)
- func (x *NormalizerSpec) GetAddDummyPrefix() bool
- func (x *NormalizerSpec) GetEscapeWhitespaces() bool
- func (x *NormalizerSpec) GetName() string
- func (x *NormalizerSpec) GetNormalizationRuleTsv() string
- func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte
- func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool
- func (*NormalizerSpec) ProtoMessage()
- func (x *NormalizerSpec) ProtoReflect() protoreflect.Message
- func (x *NormalizerSpec) Reset()
- func (x *NormalizerSpec) String() string
type SelfTestData
- func (*SelfTestData) Descriptor() ([]byte, []int) (deprecated)
- func (*SelfTestData) ExtensionRangeArray() []protoiface.ExtensionRangeV1 (deprecated)
- func (x *SelfTestData) GetSamples() []*SelfTestData_Sample
- func (*SelfTestData) ProtoMessage()
- func (x *SelfTestData) ProtoReflect() protoreflect.Message
- func (x *SelfTestData) Reset()
- func (x *SelfTestData) String() string
type SelfTestData_Sample
- func (*SelfTestData_Sample) Descriptor() ([]byte, []int) (deprecated)
- func (x *SelfTestData_Sample) GetExpected() string
- func (x *SelfTestData_Sample) GetInput() string
- func (*SelfTestData_Sample) ProtoMessage()
- func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message
- func (x *SelfTestData_Sample) Reset()
- func (x *SelfTestData_Sample) String() string
type Sentencepiece
- func NewEmptySentencepiece(lowercase bool) Sentencepiece
- func NewSentencepieceFromFile(filename string, lowercase bool) (Sentencepiece, error)
- func (s *Sentencepiece) GetControlWord(word string) (int32, bool)
- func (s *Sentencepiece) GetUnknownIndex() int32
- func (s *Sentencepiece) SetControlWord(word string, index int32)
- func (s *Sentencepiece) SetUnknownIndex(index int32)
- func (s *Sentencepiece) Tokenize(text string) []Token
- func (s *Sentencepiece) TokenizeToIDs(text string) []int32
type Token
type TrainerSpec
- func (*TrainerSpec) Descriptor() ([]byte, []int) (deprecated)
- func (*TrainerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 (deprecated)
- func (x *TrainerSpec) GetAcceptLanguage() []string
- func (x *TrainerSpec) GetBosId() int32
- func (x *TrainerSpec) GetBosPiece() string
- func (x *TrainerSpec) GetByteFallback() bool
- func (x *TrainerSpec) GetCharacterCoverage() float32
- func (x *TrainerSpec) GetControlSymbols() []string
- func (x *TrainerSpec) GetEosId() int32
- func (x *TrainerSpec) GetEosPiece() string
- func (x *TrainerSpec) GetHardVocabLimit() bool
- func (x *TrainerSpec) GetInput() []string
- func (x *TrainerSpec) GetInputFormat() string
- func (x *TrainerSpec) GetInputSentenceSize() int32
- func (x *TrainerSpec) GetMaxSentenceLength() int32
- func (x *TrainerSpec) GetMaxSentencepieceLength() int32
- func (x *TrainerSpec) GetMiningSentenceSize() int32 (deprecated)
- func (x *TrainerSpec) GetModelPrefix() string
- func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType
- func (x *TrainerSpec) GetNumSubIterations() int32
- func (x *TrainerSpec) GetNumThreads() int32
- func (x *TrainerSpec) GetPadId() int32
- func (x *TrainerSpec) GetPadPiece() string
- func (x *TrainerSpec) GetRequiredChars() string
- func (x *TrainerSpec) GetSeedSentencepieceSize() int32
- func (x *TrainerSpec) GetSelfTestSampleSize() int32
- func (x *TrainerSpec) GetShrinkingFactor() float32
- func (x *TrainerSpec) GetShuffleInputSentence() bool
- func (x *TrainerSpec) GetSplitByNumber() bool
- func (x *TrainerSpec) GetSplitByUnicodeScript() bool
- func (x *TrainerSpec) GetSplitByWhitespace() bool
- func (x *TrainerSpec) GetSplitDigits() bool
- func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool
- func (x *TrainerSpec) GetTrainingSentenceSize() int32 (deprecated)
- func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool
- func (x *TrainerSpec) GetUnkId() int32
- func (x *TrainerSpec) GetUnkPiece() string
- func (x *TrainerSpec) GetUnkSurface() string
- func (x *TrainerSpec) GetUseAllVocab() bool
- func (x *TrainerSpec) GetUserDefinedSymbols() []string
- func (x *TrainerSpec) GetVocabSize() int32
- func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool
- func (*TrainerSpec) ProtoMessage()
- func (x *TrainerSpec) ProtoReflect() protoreflect.Message
- func (x *TrainerSpec) Reset()
- func (x *TrainerSpec) String() string
type TrainerSpec_ModelType
- func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor
- func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType
- func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) (deprecated)
- func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber
- func (x TrainerSpec_ModelType) String() string
- func (TrainerSpec_ModelType) Type() protoreflect.EnumType
- func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error (deprecated)

Constants ¶

View Source

const (
	Default_TrainerSpec_ModelType                  = TrainerSpec_UNIGRAM
	Default_TrainerSpec_VocabSize                  = int32(8000)
	Default_TrainerSpec_SelfTestSampleSize         = int32(0)
	Default_TrainerSpec_CharacterCoverage          = float32(0.9994999766349792)
	Default_TrainerSpec_InputSentenceSize          = int32(0)
	Default_TrainerSpec_ShuffleInputSentence       = bool(true)
	Default_TrainerSpec_SeedSentencepieceSize      = int32(1000000)
	Default_TrainerSpec_ShrinkingFactor            = float32(0.75)
	Default_TrainerSpec_MaxSentenceLength          = int32(4192)
	Default_TrainerSpec_NumThreads                 = int32(16)
	Default_TrainerSpec_NumSubIterations           = int32(2)
	Default_TrainerSpec_MaxSentencepieceLength     = int32(16)
	Default_TrainerSpec_SplitByUnicodeScript       = bool(true)
	Default_TrainerSpec_SplitByNumber              = bool(true)
	Default_TrainerSpec_SplitByWhitespace          = bool(true)
	Default_TrainerSpec_TreatWhitespaceAsSuffix    = bool(false)
	Default_TrainerSpec_SplitDigits                = bool(false)
	Default_TrainerSpec_ByteFallback               = bool(false)
	Default_TrainerSpec_VocabularyOutputPieceScore = bool(true)
	Default_TrainerSpec_HardVocabLimit             = bool(true)
	Default_TrainerSpec_UseAllVocab                = bool(false)
	Default_TrainerSpec_UnkId                      = int32(0)
	Default_TrainerSpec_BosId                      = int32(1)
	Default_TrainerSpec_EosId                      = int32(2)
	Default_TrainerSpec_PadId                      = int32(-1)
	Default_TrainerSpec_UnkPiece                   = string("<unk>")
	Default_TrainerSpec_BosPiece                   = string("<s>")
	Default_TrainerSpec_EosPiece                   = string("</s>")
	Default_TrainerSpec_PadPiece                   = string("<pad>")
	Default_TrainerSpec_UnkSurface                 = string(" ⁇ ")
	Default_TrainerSpec_TrainExtremelyLargeCorpus  = bool(false)
)

Default values for TrainerSpec fields.

View Source

const (
	Default_NormalizerSpec_AddDummyPrefix         = bool(true)
	Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true)
	Default_NormalizerSpec_EscapeWhitespaces      = bool(true)
)

Default values for NormalizerSpec fields.

View Source

const (
	Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL
)

Default values for ModelProto_SentencePiece fields.

Variables ¶

View Source

var (
	TrainerSpec_ModelType_name = map[int32]string{
		1: "UNIGRAM",
		2: "BPE",
		3: "WORD",
		4: "CHAR",
	}
	TrainerSpec_ModelType_value = map[string]int32{
		"UNIGRAM": 1,
		"BPE":     2,
		"WORD":    3,
		"CHAR":    4,
	}
)

Enum value maps for TrainerSpec_ModelType.

View Source

var (
	ModelProto_SentencePiece_Type_name = map[int32]string{
		1: "NORMAL",
		2: "UNKNOWN",
		3: "CONTROL",
		4: "USER_DEFINED",
		6: "BYTE",
		5: "UNUSED",
	}
	ModelProto_SentencePiece_Type_value = map[string]int32{
		"NORMAL":       1,
		"UNKNOWN":      2,
		"CONTROL":      3,
		"USER_DEFINED": 4,
		"BYTE":         6,
		"UNUSED":       5,
	}
)

Enum value maps for ModelProto_SentencePiece_Type.

View Source

var File_sentencepiece_sentencepiece_model_proto protoreflect.FileDescriptor

Functions ¶

This section is empty.

Types ¶

type ModelProto ¶

type ModelProto struct {

	// Sentence pieces with scores.
	Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"`
	// Spec used to generate this model file.
	TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"`
	// Spec for text normalization.
	NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"`
	// Stores sample input and its expected segmentation to verify the model.
	SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"`
	// Spec for text de-normalization.
	DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"`
	// contains filtered or unexported fields
}

ModelProto stores model parameters. SentencePieceProcessor is supposed to be self-contained. All settings/parameters which may change the behavior must be encoded in ModelProto.

func (*ModelProto) Descriptor deprecated

func (*ModelProto) Descriptor() ([]byte, []int)

Deprecated: Use ModelProto.ProtoReflect.Descriptor instead.

func (*ModelProto) ExtensionRangeArray deprecated

func (*ModelProto) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use ModelProto.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*ModelProto) GetDenormalizerSpec ¶

func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec

func (*ModelProto) GetNormalizerSpec ¶

func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec

func (*ModelProto) GetPieces ¶

func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece

func (*ModelProto) GetSelfTestData ¶

func (x *ModelProto) GetSelfTestData() *SelfTestData

func (*ModelProto) GetTrainerSpec ¶

func (x *ModelProto) GetTrainerSpec() *TrainerSpec

func (*ModelProto) ProtoMessage ¶

func (*ModelProto) ProtoMessage()

func (*ModelProto) ProtoReflect ¶

func (x *ModelProto) ProtoReflect() protoreflect.Message

func (*ModelProto) Reset ¶

func (x *ModelProto) Reset()

func (*ModelProto) String ¶

func (x *ModelProto) String() string

type ModelProto_SentencePiece ¶

type ModelProto_SentencePiece struct {
	Piece *string                        `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty.
	Score *float32                       `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"`
	Type  *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"`
	// contains filtered or unexported fields
}

func (*ModelProto_SentencePiece) Descriptor deprecated

func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int)

Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead.

func (*ModelProto_SentencePiece) ExtensionRangeArray deprecated

func (*ModelProto_SentencePiece) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*ModelProto_SentencePiece) GetPiece ¶

func (x *ModelProto_SentencePiece) GetPiece() string

func (*ModelProto_SentencePiece) GetScore ¶

func (x *ModelProto_SentencePiece) GetScore() float32

func (*ModelProto_SentencePiece) GetType ¶

func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type

func (*ModelProto_SentencePiece) ProtoMessage ¶

func (*ModelProto_SentencePiece) ProtoMessage()

func (*ModelProto_SentencePiece) ProtoReflect ¶

func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message

func (*ModelProto_SentencePiece) Reset ¶

func (x *ModelProto_SentencePiece) Reset()

func (*ModelProto_SentencePiece) String ¶

func (x *ModelProto_SentencePiece) String() string

type ModelProto_SentencePiece_Type ¶

type ModelProto_SentencePiece_Type int32

const (
	ModelProto_SentencePiece_NORMAL       ModelProto_SentencePiece_Type = 1 // normal symbol
	ModelProto_SentencePiece_UNKNOWN      ModelProto_SentencePiece_Type = 2 // unknown symbol. only <unk> for now.
	ModelProto_SentencePiece_CONTROL      ModelProto_SentencePiece_Type = 3 // control symbols. </s>, <s>, <2ja> etc.
	ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols.
	// Typical usage of USER_DEFINED symbol
	// is placeholder.
	ModelProto_SentencePiece_BYTE   ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true.
	ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used.
)

func (ModelProto_SentencePiece_Type) Descriptor ¶

func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor

func (ModelProto_SentencePiece_Type) Enum ¶

func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type

func (ModelProto_SentencePiece_Type) EnumDescriptor deprecated

func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int)

Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead.

func (ModelProto_SentencePiece_Type) Number ¶

func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber

func (ModelProto_SentencePiece_Type) String ¶

func (x ModelProto_SentencePiece_Type) String() string

func (ModelProto_SentencePiece_Type) Type ¶

func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType

func (*ModelProto_SentencePiece_Type) UnmarshalJSON deprecated

func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error

Deprecated: Do not use.

type NormalizerSpec ¶

type NormalizerSpec struct {

	// name of normalization rule.
	Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
	// Pre-compiled normalization rule created by
	// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
	// Usually this field is set by Builder::GetNormalizerSpec() method.
	PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"`
	// Adds dummy whitespace at the beginning of text in order to
	// treat "world" in "world" and "hello world" in the same way.
	AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"`
	// Removes leading, trailing, and duplicate internal whitespace.
	RemoveExtraWhitespaces *bool `` /* 129-byte string literal not displayed */
	// Replaces whitespace with meta symbol.
	// This field must be true to train sentence piece model.
	EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"`
	// Custom normalization rule file in TSV format.
	// https://github.com/google/sentencepiece/blob/master/doc/normalization.md
	// This field is only used in SentencePieceTrainer::Train() method, which
	// compiles the rule into the binary rule stored in `precompiled_charsmap`.
	NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"`
	// contains filtered or unexported fields
}

NormalizerSpec encodes various parameters for string normalization.

func (*NormalizerSpec) Descriptor deprecated

func (*NormalizerSpec) Descriptor() ([]byte, []int)

Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead.

func (*NormalizerSpec) ExtensionRangeArray deprecated

func (*NormalizerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*NormalizerSpec) GetAddDummyPrefix ¶

func (x *NormalizerSpec) GetAddDummyPrefix() bool

func (*NormalizerSpec) GetEscapeWhitespaces ¶

func (x *NormalizerSpec) GetEscapeWhitespaces() bool

func (*NormalizerSpec) GetName ¶

func (x *NormalizerSpec) GetName() string

func (*NormalizerSpec) GetNormalizationRuleTsv ¶

func (x *NormalizerSpec) GetNormalizationRuleTsv() string

func (*NormalizerSpec) GetPrecompiledCharsmap ¶

func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte

func (*NormalizerSpec) GetRemoveExtraWhitespaces ¶

func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool

func (*NormalizerSpec) ProtoMessage ¶

func (*NormalizerSpec) ProtoMessage()

func (*NormalizerSpec) ProtoReflect ¶

func (x *NormalizerSpec) ProtoReflect() protoreflect.Message

func (*NormalizerSpec) Reset ¶

func (x *NormalizerSpec) Reset()

func (*NormalizerSpec) String ¶

func (x *NormalizerSpec) String() string

type SelfTestData ¶

type SelfTestData struct {
	Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"`
	// contains filtered or unexported fields
}

Proto to store samples for self-testing.

func (*SelfTestData) Descriptor deprecated

func (*SelfTestData) Descriptor() ([]byte, []int)

Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead.

func (*SelfTestData) ExtensionRangeArray deprecated

func (*SelfTestData) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use SelfTestData.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*SelfTestData) GetSamples ¶

func (x *SelfTestData) GetSamples() []*SelfTestData_Sample

func (*SelfTestData) ProtoMessage ¶

func (*SelfTestData) ProtoMessage()

func (*SelfTestData) ProtoReflect ¶

func (x *SelfTestData) ProtoReflect() protoreflect.Message

func (*SelfTestData) Reset ¶

func (x *SelfTestData) Reset()

func (*SelfTestData) String ¶

func (x *SelfTestData) String() string

type SelfTestData_Sample ¶

type SelfTestData_Sample struct {
	Input    *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"`
	Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"`
	// contains filtered or unexported fields
}

func (*SelfTestData_Sample) Descriptor deprecated

func (*SelfTestData_Sample) Descriptor() ([]byte, []int)

Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead.

func (*SelfTestData_Sample) GetExpected ¶

func (x *SelfTestData_Sample) GetExpected() string

func (*SelfTestData_Sample) GetInput ¶

func (x *SelfTestData_Sample) GetInput() string

func (*SelfTestData_Sample) ProtoMessage ¶

func (*SelfTestData_Sample) ProtoMessage()

func (*SelfTestData_Sample) ProtoReflect ¶

func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message

func (*SelfTestData_Sample) Reset ¶

func (x *SelfTestData_Sample) Reset()

func (*SelfTestData_Sample) String ¶

func (x *SelfTestData_Sample) String() string

type Sentencepiece ¶

type Sentencepiece struct {
	// contains filtered or unexported fields
}

Sentencepiece holds the model

func NewEmptySentencepiece ¶

func NewEmptySentencepiece(lowercase bool) Sentencepiece

NewEmptySentencepiece creates an empty sentencepiece model

func NewSentencepieceFromFile ¶

func NewSentencepieceFromFile(filename string, lowercase bool) (Sentencepiece, error)

NewSentencepieceFromFile creates sentencepiece from file.

func (*Sentencepiece) GetControlWord ¶

func (s *Sentencepiece) GetControlWord(word string) (int32, bool)

GetControlWord gets the index for the given control word

func (*Sentencepiece) GetUnknownIndex ¶

func (s *Sentencepiece) GetUnknownIndex() int32

GetUnknownIndex gets the index of the unknown id

func (*Sentencepiece) SetControlWord ¶

func (s *Sentencepiece) SetControlWord(word string, index int32)

SetControlWord sets the index for the given control word

func (*Sentencepiece) SetUnknownIndex ¶

func (s *Sentencepiece) SetUnknownIndex(index int32)

SetUnknownIndex sets the index for the unknown id

func (*Sentencepiece) Tokenize ¶

func (s *Sentencepiece) Tokenize(text string) []Token

Tokenize tokenizes text into pieces

func (*Sentencepiece) TokenizeToIDs ¶

func (s *Sentencepiece) TokenizeToIDs(text string) []int32

TokenizeToIDs tokenizes text into ids from the vocab

type Token ¶

type Token struct {
	ID   int32
	Text string
}

Token holds a unit of a tokenized word

type TrainerSpec ¶

type TrainerSpec struct {

	///////////////////////////////////////////////////////////////////
	// General parameters
	//
	// Input corpus files.
	//  Trainer accepts the following two formats:
	//  A) Monolingual: plain text, one sentence per line.
	//  B) Bilingual:   TSV, source sentence <tab> target sentence
	//  When bilingual data is passed, shared vocabulary model is built.
	//  Note that the input file must be raw corpus, not a preprocessed corpus.
	//  Trainer only loads the first `input_sentence_size` sentences specified
	//  with this parameter.
	Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"`
	// Input corpus format:
	// "text": one-sentence-per-line text format (default)
	// "tsv":  sentence <tab> freq
	InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"`
	// Output model file prefix.
	// <model_prefix>.model and <model_prefix>.vocab are generated.
	ModelPrefix *string                `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"`
	ModelType   *TrainerSpec_ModelType `` /* 129-byte string literal not displayed */
	// Vocabulary size. 8k is the default size.
	VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"`
	// List of the languages this model can accept.
	// Since the model is language-agnostic, this field is used as a reference.
	AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"`
	// Size of self-test samples, which are encoded in the model file.
	SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Training parameters.
	//
	// Uses characters which cover the corpus with the ratio of `chars_coverage`.
	// This parameter determines the set of basic Alphabet of sentence piece.
	// 1.0 - `chars_coverage` characters are treated as UNK.
	// See also required_chars field.
	CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"`
	// Maximum size of sentences the trainer loads from `input` parameter.
	// Trainer simply loads the `input` files in sequence.
	// It is better to shuffle the input corpus randomly.
	InputSentenceSize    *int32 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"`
	ShuffleInputSentence *bool  `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"`
	// Maximum size of sentences to make seed sentence pieces.
	// Extended suffix array is constructed to extract frequent
	// sub-strings from the corpus. This uses 20N working space,
	// where N is the size of corpus.
	//
	// Deprecated: Do not use.
	MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"`
	// Maximum size of sentences to train sentence pieces.
	//
	// Deprecated: Do not use.
	TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"`
	// The size of seed sentencepieces.
	// `seed_sentencepiece_size` must be larger than `vocab_size`.
	SeedSentencepieceSize *int32 `` /* 133-byte string literal not displayed */
	// In every EM sub-iterations, keeps top
	// `shrinking_factor` * `current sentencepieces size` with respect to
	// the loss of the sentence piece. This value should be smaller than 1.0.
	ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"`
	// The maximum sentence length in bytes. Sentences with a length
	// larger than `max_sentence_length` are simply ignored.
	// Longer input tends to bring the following risks:
	//  * Overflow during EM training (unigram language model only)
	//  * Performance drop because of O(n log n) cost in BPE.
	MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"`
	// Number of threads in the training.
	NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"`
	// Number of EM sub iterations.
	NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"`
	///////////////////////////////////////////////////////////////////
	// SentencePiece parameters which control the shapes of sentence piece.
	//
	// Maximum length of sentencepiece.
	MaxSentencepieceLength *int32 `` /* 131-byte string literal not displayed */
	// Uses Unicode script to split sentence pieces.
	// When `split_by_unicode_script` is true, we do not allow sentence piece to
	// include multiple Unicode scripts, e.g. "F1" is not a valid piece.
	// Exception: CJ characters (Hiragana/Katakana/Han) are all handled
	// as one script type, since Japanese word can consist of multiple scripts.
	// This exception is always applied regardless of the accept-language
	// parameter.
	SplitByUnicodeScript *bool `` /* 126-byte string literal not displayed */
	// When `split_by_number` is true, put a boundary between number and
	// non-number transition. If we want to treat "F1" as one token, set this flag
	// to be false.
	SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"`
	// Use a white space to split sentence pieces.
	// When `split_by_whitespace` is false, we may have the piece containing
	// a white space in the middle. e.g., "in_the".
	SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"`
	// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
	// hello_. When `treat_whitespace_as_suffix` is true,
	// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
	// of sentence.
	TreatWhitespaceAsSuffix *bool `` /* 135-byte string literal not displayed */
	// Split all digits (0-9) into separate pieces.
	SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Vocabulary management
	//
	// Defines control symbols used as an indicator to
	// change the behavior of the decoder. <s> and </s> are pre-defined.
	// We can use this field to encode various meta information,
	// including language indicator in multilingual model.
	// These symbols are not visible to users, but visible to
	// the decoder. Note that when the input sentence contains control symbols,
	// they are not treated as one token, but segmented into normal pieces.
	// Control symbols must be inserted independently from the segmentation.
	ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"`
	// Defines user defined symbols.
	// These symbols are added with extremely high score
	// so they are always treated as one unique symbol in any context.
	// Typical usage of user_defined_symbols is placeholder for named entities.
	UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"`
	// Defines required characters. Each UTF8 character in this string is included
	// in the character set regardless of character_coverage value. Unlike
	// user_defined_symbols, these characters have scores based on the frequency
	// on input sentences, and the model can form subwords using characters
	// in this field.
	RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"`
	// Decomposes unknown pieces into UTF-8 bytes.
	ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"`
	// When creating the vocabulary file, defines whether or not to additionally
	// output the score for each piece.
	VocabularyOutputPieceScore *bool `` /* 144-byte string literal not displayed */
	// `vocab_size` is treated as hard limit. Crash if
	// the model can not produce the vocab of size `vocab_size`,
	// When `hard_vocab_limit` is false, vocab_size is treated
	// as soft limit. Note that when model_type=char,
	// always assumes hard_vocab_limit = false.
	HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"`
	// use all symbols for vocab extraction. This flag is valid
	// if model type is either CHAR or WORD
	UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Reserved special meta tokens.
	// * -1 is not used.
	// * unk_id must not be -1.
	// Ids must start with 0 and be contiguous.
	UnkId    *int32  `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"`  // <unk>
	BosId    *int32  `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"`  // <s>
	EosId    *int32  `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"`  // </s>
	PadId    *int32  `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // <pad> (padding)
	UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=<unk>" json:"unk_piece,omitempty"`
	BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=<s>" json:"bos_piece,omitempty"`
	EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=</s>" json:"eos_piece,omitempty"`
	PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=<pad>" json:"pad_piece,omitempty"`
	// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
	// since this character can be useful both for user and
	// developer. We can easily figure out that <unk> is emitted.
	UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"`
	// Increase bit depth to allow unigram model training on large
	// (>10M sentences) corpora. A Side-effect of enabling this flag
	// is increased memory usage.
	TrainExtremelyLargeCorpus *bool `` /* 141-byte string literal not displayed */
	// contains filtered or unexported fields
}

TrainerSpec encodes various parameters for SentencePiece training.

func (*TrainerSpec) Descriptor deprecated

func (*TrainerSpec) Descriptor() ([]byte, []int)

Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead.

func (*TrainerSpec) ExtensionRangeArray deprecated

func (*TrainerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use TrainerSpec.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*TrainerSpec) GetAcceptLanguage ¶

func (x *TrainerSpec) GetAcceptLanguage() []string

func (*TrainerSpec) GetBosId ¶

func (x *TrainerSpec) GetBosId() int32

func (*TrainerSpec) GetBosPiece ¶

func (x *TrainerSpec) GetBosPiece() string

func (*TrainerSpec) GetByteFallback ¶

func (x *TrainerSpec) GetByteFallback() bool

func (*TrainerSpec) GetCharacterCoverage ¶

func (x *TrainerSpec) GetCharacterCoverage() float32

func (*TrainerSpec) GetControlSymbols ¶

func (x *TrainerSpec) GetControlSymbols() []string

func (*TrainerSpec) GetEosId ¶

func (x *TrainerSpec) GetEosId() int32

func (*TrainerSpec) GetEosPiece ¶

func (x *TrainerSpec) GetEosPiece() string

func (*TrainerSpec) GetHardVocabLimit ¶

func (x *TrainerSpec) GetHardVocabLimit() bool

func (*TrainerSpec) GetInput ¶

func (x *TrainerSpec) GetInput() []string

func (*TrainerSpec) GetInputFormat ¶

func (x *TrainerSpec) GetInputFormat() string

func (*TrainerSpec) GetInputSentenceSize ¶

func (x *TrainerSpec) GetInputSentenceSize() int32

func (*TrainerSpec) GetMaxSentenceLength ¶

func (x *TrainerSpec) GetMaxSentenceLength() int32

func (*TrainerSpec) GetMaxSentencepieceLength ¶

func (x *TrainerSpec) GetMaxSentencepieceLength() int32

func (*TrainerSpec) GetMiningSentenceSize deprecated

func (x *TrainerSpec) GetMiningSentenceSize() int32

Deprecated: Do not use.

func (*TrainerSpec) GetModelPrefix ¶

func (x *TrainerSpec) GetModelPrefix() string

func (*TrainerSpec) GetModelType ¶

func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType

func (*TrainerSpec) GetNumSubIterations ¶

func (x *TrainerSpec) GetNumSubIterations() int32

func (*TrainerSpec) GetNumThreads ¶

func (x *TrainerSpec) GetNumThreads() int32

func (*TrainerSpec) GetPadId ¶

func (x *TrainerSpec) GetPadId() int32

func (*TrainerSpec) GetPadPiece ¶

func (x *TrainerSpec) GetPadPiece() string

func (*TrainerSpec) GetRequiredChars ¶

func (x *TrainerSpec) GetRequiredChars() string

func (*TrainerSpec) GetSeedSentencepieceSize ¶

func (x *TrainerSpec) GetSeedSentencepieceSize() int32

func (*TrainerSpec) GetSelfTestSampleSize ¶

func (x *TrainerSpec) GetSelfTestSampleSize() int32

func (*TrainerSpec) GetShrinkingFactor ¶

func (x *TrainerSpec) GetShrinkingFactor() float32

func (*TrainerSpec) GetShuffleInputSentence ¶

func (x *TrainerSpec) GetShuffleInputSentence() bool

func (*TrainerSpec) GetSplitByNumber ¶

func (x *TrainerSpec) GetSplitByNumber() bool

func (*TrainerSpec) GetSplitByUnicodeScript ¶

func (x *TrainerSpec) GetSplitByUnicodeScript() bool

func (*TrainerSpec) GetSplitByWhitespace ¶

func (x *TrainerSpec) GetSplitByWhitespace() bool

func (*TrainerSpec) GetSplitDigits ¶

func (x *TrainerSpec) GetSplitDigits() bool

func (*TrainerSpec) GetTrainExtremelyLargeCorpus ¶

func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool

func (*TrainerSpec) GetTrainingSentenceSize deprecated

func (x *TrainerSpec) GetTrainingSentenceSize() int32

Deprecated: Do not use.

func (*TrainerSpec) GetTreatWhitespaceAsSuffix ¶

func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool

func (*TrainerSpec) GetUnkId ¶

func (x *TrainerSpec) GetUnkId() int32

func (*TrainerSpec) GetUnkPiece ¶

func (x *TrainerSpec) GetUnkPiece() string

func (*TrainerSpec) GetUnkSurface ¶

func (x *TrainerSpec) GetUnkSurface() string

func (*TrainerSpec) GetUseAllVocab ¶

func (x *TrainerSpec) GetUseAllVocab() bool

func (*TrainerSpec) GetUserDefinedSymbols ¶

func (x *TrainerSpec) GetUserDefinedSymbols() []string

func (*TrainerSpec) GetVocabSize ¶

func (x *TrainerSpec) GetVocabSize() int32

func (*TrainerSpec) GetVocabularyOutputPieceScore ¶

func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool

func (*TrainerSpec) ProtoMessage ¶

func (*TrainerSpec) ProtoMessage()

func (*TrainerSpec) ProtoReflect ¶

func (x *TrainerSpec) ProtoReflect() protoreflect.Message

func (*TrainerSpec) Reset ¶

func (x *TrainerSpec) Reset()

func (*TrainerSpec) String ¶

func (x *TrainerSpec) String() string

type TrainerSpec_ModelType ¶

type TrainerSpec_ModelType int32

Model type. Supported values are UNIGRAM, BPE, WORD, and CHAR.

const (
	TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm
	TrainerSpec_BPE     TrainerSpec_ModelType = 2 // Byte Pair Encoding
	TrainerSpec_WORD    TrainerSpec_ModelType = 3 // Delimited by whitespace.
	TrainerSpec_CHAR    TrainerSpec_ModelType = 4 // tokenizes into character sequence
)

func (TrainerSpec_ModelType) Descriptor ¶

func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor

func (TrainerSpec_ModelType) Enum ¶

func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType

func (TrainerSpec_ModelType) EnumDescriptor deprecated

func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int)

Deprecated: Use TrainerSpec_ModelType.Descriptor instead.

func (TrainerSpec_ModelType) Number ¶

func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber

func (TrainerSpec_ModelType) String ¶

func (x TrainerSpec_ModelType) String() string

func (TrainerSpec_ModelType) Type ¶

func (TrainerSpec_ModelType) Type() protoreflect.EnumType

func (*TrainerSpec_ModelType) UnmarshalJSON deprecated

func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error

Deprecated: Do not use.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL