processor

package
v0.2.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 25, 2023 License: Apache-2.0 Imports: 7 Imported by: 18

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BertProcessing

type BertProcessing struct {
	// contains filtered or unexported fields
}

func NewBertProcessing

func NewBertProcessing(sep, cls PostToken) (retVal *BertProcessing)

func (*BertProcessing) AddedTokens

func (bp *BertProcessing) AddedTokens(isPair bool) (retVal int)

func (*BertProcessing) Process

func (bp *BertProcessing) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) (retVal *tokenizer.Encoding)

Process post-processes input encoding(s) by adding special tokens if specified.

type ByteLevelProcessing

type ByteLevelProcessing struct {
	// contains filtered or unexported fields
}

func NewByteLevelProcessing

func NewByteLevelProcessing(pretok *pretokenizer.ByteLevel) (retVal *ByteLevelProcessing)

func (*ByteLevelProcessing) AddedTokens

func (blp *ByteLevelProcessing) AddedTokens(isPair bool) (retVal int)

func (*ByteLevelProcessing) Process

func (blp *ByteLevelProcessing) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) (retVal *tokenizer.Encoding)

type Piece added in v0.2.0

type Piece interface {
	// ExtractId(s string) Piece
	WithTypeId(typeId int)
}

func NewPiece added in v0.2.0

func NewPiece(s string) (Piece, error)

type PostToken

type PostToken struct {
	Value string
	Id    int
}

type RobertaProcessing

type RobertaProcessing struct {
	// contains filtered or unexported fields
}

RobertaProcessing is a post-processor for the Roberta model.

func DefaultRobertaProcessing

func DefaultRobertaProcessing() *RobertaProcessing

DefaultRobertaProcessing creates a RobertaProcessing with default values

func NewRobertaProcessing

func NewRobertaProcessing(sep, cls PostToken, trimOffsets bool, addPrefixSpace bool) *RobertaProcessing

func (*RobertaProcessing) AddPrefixSpace

func (rp *RobertaProcessing) AddPrefixSpace(addPrefixSpace bool)

AddPrefixSpace set whether the processor will add a prefix space

func (*RobertaProcessing) AddedTokens

func (rp *RobertaProcessing) AddedTokens(isPair bool) int

func (*RobertaProcessing) Process

func (rp *RobertaProcessing) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding

Process post-processes input encoding(s) by adding special tokens if instructed to do so.

Specifically, if addSpecialTokens=true, it will add the following special-token patterns: Single encoding: <s> Sequence </s>; Pair encoding: <s> SequenceA </s> </s> SequenceB </s>

func (*RobertaProcessing) TrimOffsets

func (rp *RobertaProcessing) TrimOffsets(trimOffsets bool)

TrimOffsets set whether the processor will trim offsets

type Sequence added in v0.2.0

type Sequence struct {
	// contains filtered or unexported fields
}

func NewSequence added in v0.2.0

func NewSequence(processors []tokenizer.PostProcessor) *Sequence

func (*Sequence) AddedTokens added in v0.2.0

func (seq *Sequence) AddedTokens(isPair bool) (retVal int)

func (*Sequence) Process added in v0.2.0

func (seq *Sequence) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) (retVal *tokenizer.Encoding)

type SequenceEnum added in v0.2.0

type SequenceEnum int
const (
	A SequenceEnum = iota
	B
)

type SequencePiece added in v0.2.0

type SequencePiece struct {
	Id     SequenceEnum `json:"id"`
	TypeId int          `json:"type_id"`
}

func NewSequencePiece added in v0.2.0

func NewSequencePiece(id string, typeId int) *SequencePiece

func (*SequencePiece) WithTypeId added in v0.2.0

func (p *SequencePiece) WithTypeId(v int)

WithTypeId implements the Piece interface for SequencePiece.

type SpecialToken added in v0.2.0

type SpecialToken struct {
	// A unique id used to identify this SpecialToken in the template
	Id string

	// The list of associated ids
	Ids []int

	// The list of associated tokens
	Tokens []string
}

Represents a bunch of tokens to be used in a template. Usually, special tokens have only one associated id/token but in some cases, it might be interesting to have multiple ids/tokens.

func NewSpecialToken added in v0.2.0

func NewSpecialToken(id string, ids []int, tokens []string) *SpecialToken

func NewSpecialTokenFrom added in v0.2.0

func NewSpecialTokenFrom(s string, id int) *SpecialToken

type SpecialTokenPiece added in v0.2.0

type SpecialTokenPiece struct {
	Id     string `json:"id"`
	TypeId int    `json:"type_id"`
}

func NewSpecialTokenPiece added in v0.2.0

func NewSpecialTokenPiece(id string, typeId int) *SpecialTokenPiece

func (*SpecialTokenPiece) WithTypeId added in v0.2.0

func (p *SpecialTokenPiece) WithTypeId(v int)

type Template added in v0.2.0

type Template []Piece

func NewTemplate added in v0.2.0

func NewTemplate(v interface{}) (Template, error)

func NewTemplateFromMulti added in v0.2.0

func NewTemplateFromMulti(parts []string) (Template, error)

func NewTemplateFromOne added in v0.2.0

func NewTemplateFromOne(s string) (Template, error)

type TemplateProcessing added in v0.2.0

type TemplateProcessing struct {
	Single        Template
	Pair          Template
	AddedSingle   int
	AddedPair     int
	SpecialTokens *Tokens
}

TemplateProcessing is a PostProcessor that takes care of processing each input Encoding by applying the corresponding template, before merging them into the final Encoding.

A Template is a sequence of Piece that will be concatenated together in the given order. Each Piece represents either one of the input Encodings or a SpecialToken.

Example (from the original Rust implementation):

	let template = TemplateProcessing::builder()
	    .try_single("[CLS] $A [SEP]").unwrap()
	    .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1").unwrap()
	    .special_tokens(vec![("[CLS]", 1), ("[SEP]", 0)])
	    .build()
	    .unwrap();

func DefaultTemplateProcessing added in v0.2.0

func DefaultTemplateProcessing() *TemplateProcessing

func NewTemplateProcessing added in v0.2.0

func NewTemplateProcessing(single, pair Template, specialTokens *Tokens) *TemplateProcessing

func NewTemplateProcessingFrom added in v0.2.0

func NewTemplateProcessingFrom(t *TemplateProcessingDeserializer) *TemplateProcessing

func (*TemplateProcessing) AddedTokens added in v0.2.0

func (tp *TemplateProcessing) AddedTokens(isPair bool) int

func (*TemplateProcessing) ApplyTemplate added in v0.2.0

func (tp *TemplateProcessing) ApplyTemplate(template []Piece, encodings []tokenizer.Encoding, addSpecialTokens bool) []tokenizer.Encoding

func (*TemplateProcessing) Builder added in v0.2.0

func (*TemplateProcessing) Process added in v0.2.0

func (tp *TemplateProcessing) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding

type TemplateProcessingBuilder added in v0.2.0

type TemplateProcessingBuilder struct {
	*TemplateProcessing
}

func (*TemplateProcessingBuilder) Build added in v0.2.0

func (*TemplateProcessingBuilder) DefaultAdded added in v0.2.0

func (b *TemplateProcessingBuilder) DefaultAdded(isSingle bool) int

func (*TemplateProcessingBuilder) NewPair added in v0.2.0

func (b *TemplateProcessingBuilder) NewPair(v interface{})

func (*TemplateProcessingBuilder) NewSingle added in v0.2.0

func (b *TemplateProcessingBuilder) NewSingle(v interface{})

func (*TemplateProcessingBuilder) NewSpecialTokens added in v0.2.0

func (b *TemplateProcessingBuilder) NewSpecialTokens(tokens []tokenizer.Token)

func (*TemplateProcessingBuilder) Validate added in v0.2.0

func (b *TemplateProcessingBuilder) Validate() error

type TemplateProcessingDeserializer added in v0.2.0

type TemplateProcessingDeserializer struct {
	Single        Template
	Pair          Template
	SpecialTokens *Tokens
}

type Tokens added in v0.2.0

type Tokens struct {
	TokenMap map[string]SpecialToken // NOTE. HF is an ordered map
	// contains filtered or unexported fields
}

Tokens is a collection of SpecialToken values indexed by their ID.

func DefaultTokens added in v0.2.0

func DefaultTokens() *Tokens

func NewTokens added in v0.2.0

func NewTokens(toks []tokenizer.Token) *Tokens

func NewTokensFrom added in v0.2.0

func NewTokensFrom(toks []SpecialToken) *Tokens

func NewTokensFromMap added in v0.2.0

func NewTokensFromMap(m map[string]SpecialToken) *Tokens

func (*Tokens) GetItemByKey added in v0.2.0

func (t *Tokens) GetItemByKey(id string) (SpecialToken, bool)

func (*Tokens) GetItemByOrder added in v0.2.0

func (t *Tokens) GetItemByOrder(index int) (SpecialToken, bool)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL