convert

package
v0.0.38 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 18, 2022 License: LGPL-3.0 Imports: 28 Imported by: 0

Documentation

Overview

Package convert extends Go's x/text/encoding capability to convert legacy encoded text to modern UTF-8 encoding.

Example (Swap)
fmt.Print(string(swap(DEL)))
fmt.Print(string(swap(SquareRoot)))
Output:

Δ✓

Index

Examples

Constants

View Source
const (
	DosSUB    = 8594 // DosSUB is code point 8594 (U+2192, rightwards arrow →). NOTE(review): presumably the DOS glyph for SUB — confirm.
	SymbolSUB = 9242 // SymbolSUB is code point 9242 (U+241A, the control picture ␚ for substitute).
)
View Source
// Control code values, numbered consecutively by iota from NUL (0)
// through US (31), followed by SP (32). The order of the declarations
// determines each value, so entries must not be reordered.
const (
	// NUL Null control code.
	NUL = iota
	// SOH Start of heading.
	SOH
	// STX Start of text.
	STX
	// ETX End of text.
	ETX
	// EOT End of transmission.
	EOT
	// ENQ Enquiry.
	ENQ
	// ACK Acknowledge.
	ACK
	// BEL Bell or alert.
	BEL
	// BS Backspace.
	BS
	// HT Horizontal tabulation.
	HT
	// LF Line feed.
	LF
	// VT Vertical tabulation.
	VT
	// FF Form feed.
	FF
	// CR Carriage return.
	CR
	// SO Shift out.
	SO
	// SI Shift in.
	SI
	// DLE Data Link Escape.
	DLE
	// DC1 Device control one.
	DC1
	// DC2 Device control two.
	DC2
	// DC3 Device control three.
	DC3
	// DC4 Device control four.
	DC4
	// NAK Negative acknowledge.
	NAK
	// SYN Synchronous idle.
	SYN
	// ETB End of transmission block.
	ETB
	// CAN Cancel.
	CAN
	// EM End of medium.
	EM
	// SUB Substitute.
	SUB
	// ESC Escape.
	ESC
	// FS File separator.
	FS
	// GS Group separator.
	GS
	// RS Record separator.
	RS
	// US Unit separator.
	US
	// SP Space.
	SP
)
View Source
// Named code point values for individual characters.
// Values up to 255 are single-byte positions; larger values are
// Unicode code points.
const (
	// LeftSquareBracket [.
	LeftSquareBracket = 91
	// VerticalBar |.
	VerticalBar = 124
	// DEL Delete.
	DEL = 127
	// Dash Hyphen -.
	Dash = 150
	// Nbsp Non-breaking space at code point 160 (U+00A0).
	Nbsp = 160
	// InvertedExclamation ¡.
	InvertedExclamation = 161
	// Cent ¢.
	Cent = 162
	// BrokenBar ¦.
	BrokenBar = 166
	// Negation ¬.
	Negation = 172
	// PlusMinus ±.
	PlusMinus = 177
	// LightVertical light vertical │.
	LightVertical = 179
	// SquareRoot Square root √.
	SquareRoot = 251
	// NBSP Non-breaking space at code point 255.
	// NOTE(review): presumably the IBM code page 437 position — confirm.
	NBSP = 255
	// Delta Δ.
	Delta = 916
	// LeftwardsArrow ←.
	LeftwardsArrow = 8592
	// SquareRootU Unicode square root √.
	SquareRootU = 8730
	// House ⌂.
	House = 8962
	// IntegralExtension ⎮.
	IntegralExtension = 9134
	// SymbolNUL ␀.
	SymbolNUL = 9216
	// SymbolESC ␛.
	SymbolESC = 9243
	// SymbolDEL ␡.
	SymbolDEL = 9249
	// LightVerticalU Box drawing light vertical │.
	LightVerticalU = 9474
	// CheckMark ✓.
	CheckMark = 10003
	// Replacement Replacement character �.
	Replacement = 65533
	// OpenBox Open box ␣.
	OpenBox = 9251
)

Variables

View Source
// Encodings for the ASA/ANSI X3.4 revisions, each exposed through the
// encoding.Encoding interface and backed by a package-private value.
var (
	// AsaX34_1963 ASA X3.4 1963.
	AsaX34_1963 encoding.Encoding = &x34_1963 // nolint: gochecknoglobals

	// AsaX34_1965 ASA X3.4 1965.
	AsaX34_1965 encoding.Encoding = &x34_1965 // nolint: gochecknoglobals

	// AnsiX34_1967 ANSI X3.4 1967/77/86.
	AnsiX34_1967 encoding.Encoding = &x34_1967 // nolint: gochecknoglobals

)
View Source
// Sentinel errors of the convert package. Compare against them with errors.Is.
var (
	// ErrChainANSI reports that ansi() was not chained after swap.
	ErrChainANSI = errors.New("ansi() is a chain method that is to be used in conjunction with swap: c.swap().ansi()")
	// ErrChainWrap reports that wrapWidth() was not chained after swap.
	ErrChainWrap = errors.New("wrapWidth() is a chain method that is to be used in conjunction with swap: c.swap().wrapWidth()")
	// ErrBytes reports that an empty byte slice cannot be transformed.
	ErrBytes = errors.New("cannot transform an empty byte slice")
	// ErrEncoding reports that no encoding was provided.
	ErrEncoding = errors.New("no encoding provided")
	// ErrName reports that no encoding matches the name or alias.
	ErrName = errors.New("encoding cannot match name or alias")
	// ErrUTF8 reports that a string cannot be encoded to UTF-8.
	ErrUTF8 = errors.New("string cannot encode to utf-8")
	// ErrUTF16 reports that UTF-16 table encodings are unsupported.
	ErrUTF16 = errors.New("utf-16 table encodings are not supported")
	// ErrUTF32 reports that UTF-32 table encodings are unsupported.
	ErrUTF32 = errors.New("utf-32 table encodings are not supported")
	// ErrWidth reports that the column count could not be determined from the line break.
	ErrWidth = errors.New("cannot determine the number columns from using line break")
)
View Source
// ErrNilEncoding reports that a character encoding value is nil.
var ErrNilEncoding = errors.New("character encoding cannot be a nil value")
View Source
// ErrNoName reports that an encoding name is missing.
var ErrNoName = errors.New("there is no encoding name")

Functions

func AliasFmt added in v0.0.38

func AliasFmt(alias, value string, e encoding.Encoding) (string, error)

AliasFmt returns character encoding aliases.

func AsaX34 added in v0.0.38

func AsaX34(e encoding.Encoding) string

AsaX34 returns a named value for the legacy ASA ASCII character encodings.

func BOM

func BOM() []byte

BOM is the UTF-8 byte order mark prefix.

Example
fmt.Printf("%X", BOM())
Output:

EFBBBF

func D437

func D437(s string) ([]byte, error)

D437 decodes IBM Code Page 437 encoded text.

Example
const name = base + "cp437In.txt"
result, err := D437(cp437hex)
if err != nil {
	log.Fatal(err)
}
_, err = filesystem.SaveTemp(name, result...)
if err != nil {
	log.Fatal(err)
}
t, err := filesystem.ReadText(name)
if err != nil {
	log.Fatal(err)
}
fmt.Print(t)
Output:

═╣▓╠═

func DString

func DString(s string, c *charmap.Charmap) ([]byte, error)

DString decodes simple character encoding text.

func E437

func E437(s string) ([]byte, error)

E437 encodes text into IBM Code Page 437.

Example
const name = base + "cp437.txt"
result, err := E437(utf)
if err != nil {
	log.Fatal(err)
}
_, err = filesystem.SaveTemp(name, result...)
if err != nil {
	log.Fatal(err)
}
t, err := filesystem.ReadText(name)
if err != nil {
	log.Fatal(err)
}
filesystem.Clean(name)
fmt.Print(len(t))
Output:

8

func EString

func EString(s string, c *charmap.Charmap) ([]byte, error)

EString encodes text into a simple character encoding.

func Encoder added in v0.0.33

func Encoder(name string) (encoding.Encoding, error)

Encoder returns the named character set encoder.

func Encodings

func Encodings() []encoding.Encoding

Encodings returns all the supported legacy text encodings.

func HexDecode

func HexDecode(s string) ([]byte, error)

HexDecode decodes a hexadecimal string into bytes.

func HexEncode

func HexEncode(s string) []byte

HexEncode encodes a string into hexadecimal bytes.

func Humanize added in v0.0.31

func Humanize(name string) string

Humanize the encoding by using a shorter, less formal name.

func ISO11Name added in v0.0.38

func ISO11Name(name string) bool

func List

func List() *bytes.Buffer

List returns a tabled list of supported IANA character set encodings.

func MakeBytes

func MakeBytes() []byte

MakeBytes generates a 256 character or 8-bit container ready to hold legacy code point values.

func Mark

func Mark(b []byte) []byte

Mark adds a UTF-8 byte order mark to the text if it doesn't already exist.

func Numeric added in v0.0.38

func Numeric(name string) int

Numeric returns a numeric alias for a character encoding. A -1 int is returned whenever an alias could not be generated. Unicode based encodings always return -1.

func Table

func Table(name string) (*bytes.Buffer, error)

Table prints out all the characters in the named 8-bit character set.

func TrimEOF added in v0.0.33

func TrimEOF(b []byte) []byte

TrimEOF will cut text at the first occurrence of the SUB character. The SUB is used by DOS and CP/M as an end-of-file marker.

func Uniform added in v0.0.38

func Uniform(mime string) string

Uniform formats MIME values.

Types

type Cell added in v0.0.38

// Cell holds the character encoding details that Cells returns,
// with every detail stored as a string.
type Cell struct {
	Name    string // Name of the encoding. NOTE(review): presumably the formal name — confirm.
	Value   string // NOTE(review): presumably the encoding's named value — confirm.
	Numeric string // NOTE(review): presumably the numeric alias, see Numeric — confirm.
	Alias   string // NOTE(review): presumably the alias produced by AliasFmt — confirm.
}

func Cells added in v0.0.38

func Cells(e encoding.Encoding) (Cell, error)

Cells returns character encoding details for use in a text table.

type Convert added in v0.0.31

// Convert carries the flags, the input text and the transformed output
// runes for a conversion to UTF-8.
type Convert struct {
	Flags  Flag   // Command-line supplied flag values.
	Input  In     // Input text for transformation.
	Output []rune // Transformed UTF-8 runes.
	// contains filtered or unexported fields
}

Convert 8-bit legacy or other Unicode text to UTF-8.

func (*Convert) ANSI added in v0.0.31

func (c *Convert) ANSI(b ...byte) ([]rune, error)

ANSI transforms legacy encoded ANSI into modern UTF-8 text. It displays ASCII control codes as characters. It obeys the DOS end of file marker.

func (*Convert) ANSIControls added in v0.0.31

func (c *Convert) ANSIControls() *Convert

ANSIControls replaces all ←[ and ␛[ character matches with functional ANSI escape controls.

func (*Convert) Chars added in v0.0.31

func (c *Convert) Chars(b ...byte) ([]rune, error)

Chars transforms legacy encoded characters and text control codes into UTF-8 characters. It displays both ASCII and ANSI control codes as characters. It ignores the DOS end of file marker.

func (*Convert) Dump added in v0.0.31

func (c *Convert) Dump(b ...byte) ([]rune, error)

Dump transforms legacy encoded text or ANSI into modern UTF-8 text. It obeys common ASCII control codes. It ignores the DOS end of file marker.

func (*Convert) LineBreaks added in v0.0.31

func (c *Convert) LineBreaks()

LineBreaks will try to guess the line break representation as a 2 byte value. A guess of Unix will return [10, 0], Windows [13, 10], otherwise a [0, 0] value is returned.

func (*Convert) RunesControls added in v0.0.31

func (c *Convert) RunesControls()

RunesControls switches out C0 and C1 ASCII controls with Unicode Control Picture representations.

func (*Convert) RunesControlsEBCDIC added in v0.0.31

func (c *Convert) RunesControlsEBCDIC()

RunesControlsEBCDIC switches out EBCDIC controls with Unicode Control Picture representations.

func (*Convert) RunesDOS added in v0.0.31

func (c *Convert) RunesDOS()

RunesDOS switches out C0, C1 and other controls with PC/MS-DOS picture glyphs.

func (*Convert) RunesEBCDIC added in v0.0.31

func (c *Convert) RunesEBCDIC()

RunesEBCDIC switches out EBCDIC IBM mainframe controls with Unicode picture representations. Where no appropriate picture exists, a space placeholder is used.

func (*Convert) RunesKOI8 added in v0.0.31

func (c *Convert) RunesKOI8()

RunesKOI8 blanks out unused C0, C1 and other control spaces for Russian sets.

func (*Convert) RunesLatin added in v0.0.31

func (c *Convert) RunesLatin()

RunesLatin blanks out unused C0, C1 and other control spaces for ISO Latin sets.

func (*Convert) RunesMacintosh added in v0.0.31

func (c *Convert) RunesMacintosh()

RunesMacintosh replaces specific Mac OS Roman characters with Unicode picture representations.

func (*Convert) RunesShiftJIS added in v0.0.31

func (c *Convert) RunesShiftJIS()

RunesShiftJIS tweaks some Unicode picture representations for Shift-JIS.

func (*Convert) RunesUTF8 added in v0.0.31

func (c *Convert) RunesUTF8()

RunesUTF8 tweaks some Unicode picture representations for UTF-8 Basic Latin.

func (*Convert) RunesWindows added in v0.0.31

func (c *Convert) RunesWindows()

RunesWindows tweaks some Unicode picture representations for Windows-125x sets.

func (*Convert) Swap added in v0.0.31

func (c *Convert) Swap() *Convert

Swap transforms character map and control codes into UTF-8 Unicode runes.

func (*Convert) Text added in v0.0.31

func (c *Convert) Text(b ...byte) ([]rune, error)

Text transforms legacy encoded text or ANSI into modern UTF-8 text. It obeys common ASCII control codes. It obeys the DOS end of file marker.

func (*Convert) Transform added in v0.0.31

func (c *Convert) Transform() error

Transform byte data from named character map encoded text into UTF-8.

type Encoding

// Encoding embeds an existing encoding.Encoding and pairs it with a name.
type Encoding struct {
	encoding.Encoding        // The wrapped encoding; its methods are promoted.
	Name              string // Name given to the wrapped encoding. NOTE(review): presumably what String returns — confirm.
}

Encoding is an implementation of the Encoding interface that adds the String and ID methods to an existing encoding.

func (Encoding) String added in v0.0.33

func (e Encoding) String() string

type Flag added in v0.0.33

// Flag groups the user-supplied values: control codes, character swaps
// and the maximum line width.
type Flag struct {
	Controls  []string // Always use these control codes.
	SwapChars []string // Swap out these characters with UTF-8 alternatives.
	MaxWidth  int      // Maximum text width per-line.
}

Flag holds the user-supplied values.

type In added in v0.0.33

// In pairs the input bytes with the encoding they are written in.
type In struct {
	Encoding encoding.Encoding // Bytes text encoding.
	Bytes    []byte            // Input text as bytes.
	// contains filtered or unexported fields
}

In is the text input for conversion.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL