decoder

package
v0.2.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 25, 2023 License: Apache-2.0 Imports: 5 Imported by: 2

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BpeDecoder

type BpeDecoder struct {
	*DecoderBase
	// contains filtered or unexported fields
}

BpeDecoder allows decoding the original BPE by joining all the tokens and then replacing the suffix used to identify end-of-words with whitespace.

func DefaultBpeDecoder

func DefaultBpeDecoder() *BpeDecoder

DefaultBpeDecoder creates a new BpeDecoder with the default suffix (`</w>`).

func NewBpeDecoder

func NewBpeDecoder(suffix string) *BpeDecoder

NewBpeDecoder creates a new BpeDecoder

func (*BpeDecoder) DecodeChain added in v0.2.0

func (bd *BpeDecoder) DecodeChain(tokens []string) []string

type ByteFallback added in v0.2.0

type ByteFallback struct {
	*DecoderBase
	// contains filtered or unexported fields
}

func NewByteFallback added in v0.2.0

func NewByteFallback() *ByteFallback

func (*ByteFallback) DecodeChain added in v0.2.0

func (d *ByteFallback) DecodeChain(tokens []string) []string

type CTC added in v0.2.0

type CTC struct {
	*DecoderBase

	PadToken           string // the pad token used by CTC to delimit a new token
	WordDelimiterToken string // the word delimiter token. It will be replaced by a `<space>`
	Cleanup            bool   // whether to clean up some tokenization artifacts, mainly spaces before punctuation and some abbreviated English forms
}

func DefaultCTC added in v0.2.0

func DefaultCTC() *CTC

func NewCTC added in v0.2.0

func NewCTC(padToken string, wordDelimiterToken string, cleanup bool) *CTC

func (*CTC) DecodeChain added in v0.2.0

func (d *CTC) DecodeChain(tokens []string) []string

type DecoderBase added in v0.2.0

type DecoderBase struct {
	tokenizer.Decoder // Embed the Decoder interface here so that a struct that embeds `DecoderBase` can override its methods.
}

func (*DecoderBase) Decode added in v0.2.0

func (d *DecoderBase) Decode(tokens []string) string

func (*DecoderBase) DecodeChain added in v0.2.0

func (d *DecoderBase) DecodeChain(tokens []string) []string

NOTE: this method is here for validation only. It is overridden when a struct that embeds `DecoderBase` defines its own DecodeChain.

type Fuse added in v0.2.0

type Fuse struct {
	*DecoderBase
}

Fuse is a decoder that simply fuses all tokens into one big string.

func NewFuse added in v0.2.0

func NewFuse() *Fuse

func (*Fuse) DecodeChain added in v0.2.0

func (f *Fuse) DecodeChain(tokens []string) []string

type Sequence added in v0.2.0

type Sequence struct {
	*DecoderBase
	// contains filtered or unexported fields
}

func NewSequence added in v0.2.0

func NewSequence(decoders []tokenizer.Decoder) *Sequence

func (*Sequence) DecodeChain added in v0.2.0

func (d *Sequence) DecodeChain(tokens []string) []string

DecodeChain implements the `tokenizer.Decoder` interface.

type Strip added in v0.2.0

type Strip struct {
	*DecoderBase

	Content string
	Start   int
	Stop    int
}

func NewStrip added in v0.2.0

func NewStrip(content string, start, stop int) *Strip

func (*Strip) DecodeChain added in v0.2.0

func (d *Strip) DecodeChain(tokens []string) []string

type WordPieceDecoder

type WordPieceDecoder struct {
	*DecoderBase
	// contains filtered or unexported fields
}

WordPieceDecoder takes care of decoding a list of wordpiece tokens back into a readable string.

func DefaultWordpieceDecoder

func DefaultWordpieceDecoder() *WordPieceDecoder

DefaultWordpieceDecoder creates a new WordPieceDecoder with the default prefix and cleanup settings.

func NewWordPieceDecoder

func NewWordPieceDecoder(prefix string, cleanup bool) *WordPieceDecoder

NewWordPieceDecoder creates a new WordPieceDecoder with the given prefix and cleanup option.

func (*WordPieceDecoder) Cleanup added in v0.2.0

func (wd *WordPieceDecoder) Cleanup(tok string) string

func (*WordPieceDecoder) DecodeChain added in v0.2.0

func (wd *WordPieceDecoder) DecodeChain(tokens []string) []string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL