converter

package
v0.2.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 10, 2023 License: Apache-2.0 Imports: 61 Imported by: 1

Documentation

Overview

Package converter implements functions for converting files to PDF.

Index

Constants

View Source
const DefaultMaxSubprocMemoryBytes = 2 << 30 // 2GiB
View Source
const ErrTextFn = "ZZZ-errors.txt"

ErrTextFn is the name of the errors list in the resulting archive.

View Source
const LofficeLockPort = 27999

LofficeLockPort is the port used for LibreOffice locking (only one instance should be running).

Variables

View Source
var (
	// ConfPdftk is the path for PdfTk
	ConfPdftk = config.String("pdftk", lookPath("pdftk"))

	// ConfPdfseparate is the path for pdfseparate (member of poppler-utils)
	ConfPdfseparate = config.String("pdfseparate", "pdfseparate")

	// ConfLoffice is the path for LibreOffice
	ConfLoffice = config.String("loffice", lookPath("loffice"))

	// ConfGm is the path for GraphicsMagick
	ConfGm = config.String("gm", lookPath("gm"))

	// ConfGs is the path for GhostScript
	ConfGs = config.String("gs", lookPath("gs"))

	// ConfPdfClean is the path for pdfclean
	ConfPdfClean = config.String("pdfclean", lookPath("pdfclean"))

	// ConfMutool is the path for mutool
	ConfMutool = config.String("mutool", lookPath("mutool"))

	// ConfWkhtmltopdf is the path for wkhtmltopdf
	ConfWkhtmltopdf = config.String("wkhtmltopdf", lookPath("wkhtmltopdf"))

	// ConfSortBeforeMerge should be true if generally we should sort files by filename before merge
	ConfSortBeforeMerge = config.Bool("sortBeforeMerge", false)

	// ConfChildTimeout is the time before the child gets killed
	ConfChildTimeout = config.Duration("childTimeout", 10*time.Minute)

	// ConfLofficeTimeout is the time before LibreOffice gets killed.
	ConfLofficeTimeout = config.Duration("lofficeTimeout", time.Minute)

	// ConcLimit limits the concurrently running child processes
	ConcLimit = NewRateLimiter(Concurrency)

	// ConfWorkdir is the working directory (will be os.TempDir() if empty)
	ConfWorkdir = config.String("workdir", "")

	// ConfListenAddr is a listen address for HTTP requests
	ConfListenAddr = config.String("listen", ":9500")

	// ConfDefaultIsService decides whether start as service without args
	ConfDefaultIsService = config.Bool("defaultIsService", false)

	// ConfLofficeUsePortLock defines whether to limit Loffice usage by a port lock
	ConfLofficeUsePortLock = config.Bool("lofficeUsePortLock", !osgroup.IsInsideDocker())

	// ConfLogFile specifies the file to log - instead of command line.
	ConfLogFile = config.String("logfile", "")

	// ConfKeepRemoteImage specifies whether to keep the remote sources of images (img src="http://mailtrack...").
	ConfKeepRemoteImage = config.Bool("keepRemoteImage", false)

	// ConfGotenbergURL is the working Gotenberg (https://pkg.go.dev/github.com/gotenberg/gotenberg/v7) service URL
	ConfGotenbergURL = &gotenberg.URL

	// ConfMaxSubprocMemoryBytes is the limit for subprocess' memory.
	ConfMaxSubprocMemoryBytes = config.Uint64("max-subproc-mem-bytes", DefaultMaxSubprocMemoryBytes)
)
View Source
var Concurrency = int(8)

Concurrency is the default number of concurrent goroutines.

View Source
var ErrBadPDF = errors.New("bad pdf")
View Source
var ErrPasswordProtected = errors.New("password protected")
View Source
var ErrSkip = errors.New("skip this part")
View Source
var Exec procRunner
View Source
var ExtContentType = map[string]string{
	"doc":  "application/vnd.ms-word",
	"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	"dotx": "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
	"xls":  "application/vnd.ms-excel",
	"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	"xltx": "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
	"ppt":  "application/vnd.ms-powerpoint",
	"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
	"ppsx": "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
	"potx": "application/vnd.openxmlformats-officedocument.presentationml.template",

	"odg": "application/vnd.oasis.opendocument.graphics",
	"otg": "application/vnd.oasis.opendocument.graphics-template",
	"otp": "application/vnd.oasis.opendocument.presentation-template",
	"odp": "application/vnd.oasis.opendocument.presentation",
	"odm": "application/vnd.oasis.opendocument.text-master",
	"odt": "application/vnd.oasis.opendocument.text",
	"oth": "application/vnd.oasis.opendocument.text-web",
	"ott": "application/vnd.oasis.opendocument.text-template",
	"ods": "application/vnd.oasis.spreadsheet",
	"ots": "application/vnd.oasis.spreadsheet-template",
	"odc": "application/vnd.oasis.chart",
	"odf": "application/vnd.oasis.formula",
	"odb": "application/vnd.oasis.database",
	"odi": "application/vnd.oasis.image",

	"txt": textPlain,
	"msg": mimeOutlook,

	"jpg":  "image/jpeg",
	"jpeg": "image/jpeg",
	"gif":  "image/gif",
	"png":  "image/png",
	"tif":  "image/tif",
	"tiff": "image/tiff",
}

file extension -> content-type map

View Source
var Filters = make([]FilterFunc, 0, 6)

Filters is the filter pipeline - order is application order

View Source
var LeaveTempFiles = false

LeaveTempFiles should be true only for debugging purposes (leaves temp files)

View Source
var OtherToPdf = OfficeToPdf

OtherToPdf is the default converter

View Source
var PrependHeaders = []string{"From", "To", "Cc", "Subject", "Date"}

PrependHeaders are the headers which should be prepended to the printed mail

View Source
var SaveOriginalHTML = false

SaveOriginalHTML specifies whether to save the original HTML (do not delete it).

View Source
var Workdir = os.TempDir()

Workdir is the main working directory

View Source
var WriteTextAsPDF func(w io.Writer, r io.Reader) error

Functions

func DupFilter

func DupFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

func ExtractingFilter

func ExtractingFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

ExtractingFilter is a filter for the mail pipeline which extracts archives

func FixContentType

func FixContentType(body []byte, contentType, fileName string) (ct string)

FixContentType ensures proper content-type (uses magic for "" and application/octet-stream)

func GetRequestID added in v0.2.0

func GetRequestID(ctx context.Context) string

func HTMLPartFilter

func HTMLPartFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

HTMLPartFilter reads multipart/alternative (text/plain + text/html), preferring the html part + groups the multipart/related images which are referred in the html.

multipart/related encapsulates multipart/alternative, which contains text/plain and text/html, the related part contains images, too - at least usually.

func HTMLToPdf

func HTMLToPdf(ctx context.Context, destfn string, r io.Reader, contentType string) error

HTMLToPdf converts HTML (text/html) to PDF

func ImageToPdf

func ImageToPdf(ctx context.Context, destfn string, r io.Reader, contentType string) error

ImageToPdf converts an image (image/...) to PDF

func ImageToPdfGm

func ImageToPdfGm(ctx context.Context, w io.Writer, r io.Reader, contentType string) error

ImageToPdfGm converts image to PDF using GraphicsMagick

func ImageToPdfPdfCPU added in v0.2.0

func ImageToPdfPdfCPU(w io.Writer, r io.Reader) error

ImageToPdfPdfCPU converts image to PDF using pdfcpu

func LoadConfig

func LoadConfig(ctx context.Context, fn string) error

LoadConfig loads TOML config file

func MIMEMatch added in v0.0.3

func MIMEMatch(b []byte) string

func MPRelatedToPdf

func MPRelatedToPdf(ctx context.Context, destfn string, r io.Reader, contentType string) error

MPRelatedToPdf converts multipart/related to PDF

func MailToPdfZip

func MailToPdfZip(ctx context.Context, destfn string, body io.Reader, contentType string) error

func MailToSplittedPdfZip

func MailToSplittedPdfZip(ctx context.Context, destfn string, body io.Reader,
	contentType string, split bool, imgmime, imgsize string,
	pages []uint16,
) error

MailToSplittedPdfZip converts mail to ZIP of PDFs and images

func MailToTree

func MailToTree(ctx context.Context, outdir string, r io.Reader) error

MailToTree writes mail parts as files starting at outdir as root, trying to reimplement the mime hierarchy in the directory hierarchy

func MailToZip

func MailToZip(ctx context.Context, destfn string, body io.Reader, contentType string) error

MailToZip dumps mail and all parts into ZIP

func NewB64QuoPriDecoder

func NewB64QuoPriDecoder(r io.Reader) io.Reader

NewB64QuoPriDecoder replaces bork encoding (+base64-)

func NewCidMapper

func NewCidMapper(cids map[string]string, subDir string, r io.Reader) io.Reader

NewCidMapper remaps Content-Id urls to ContentDir/filename and returns the map

func NewEqsignStripper

func NewEqsignStripper(r io.Reader) io.Reader

NewEqsignStripper returns a reader which strips equal signs from line endings

func NewOLEStorageReader

func NewOLEStorageReader(ctx context.Context, r io.Reader) (io.ReadCloser, error)

NewOLEStorageReader converts Outlook .msg files to .eml RFC822 email files. For this it uses perl Email::Outlook::Message (thanks, @matijs), and returns an io.ReadCloser with the converted data.

This calls out to perl, and needs Email::Outlook::Message (can be installed with `cpan -i Email::Outlook::Message`).

See http://www.matijs.net/software/msgconv

func NewQuoPriDecoder

func NewQuoPriDecoder(r io.Reader) io.Reader

NewQuoPriDecoder replaces =A0= with \n

func NewScannerReader

func NewScannerReader(s *bufio.Scanner) io.Reader

NewScannerReader turns a bufio.Scanner to an io.Reader

func NewTextReader

func NewTextReader(ctx context.Context, r io.Reader, charset string) io.Reader

NewTextReader wraps a reader with a proper charset converter

func NewULID added in v0.2.0

func NewULID() ulid.ULID

func OfficeToPdf

func OfficeToPdf(ctx context.Context, destfn string, r io.Reader, contentType string) error

OfficeToPdf converts other to PDF with LibreOffice

func OutlookToEML added in v0.0.3

func OutlookToEML(ctx context.Context, destfn string, r io.Reader, contentType string) error

func PdfClean

func PdfClean(ctx context.Context, fn string) (err error)

PdfClean cleans PDF from restrictions

func PdfDumpFdf

func PdfDumpFdf(ctx context.Context, destfn, inpfn string) error

PdfDumpFdf dumps the FDF from the given PDF.

func PdfDumpFields

func PdfDumpFields(ctx context.Context, inpfn string) ([]string, error)

PdfDumpFields dumps the field names from the given PDF.

func PdfFillFdf

func PdfFillFdf(ctx context.Context, destfn, inpfn string, values map[string]string) error

PdfFillFdf fills the FDF and generates PDF.

func PdfMerge

func PdfMerge(ctx context.Context, destfn string, filenames ...string) error

PdfMerge merges pdf files into destfn

func PdfPageNum

func PdfPageNum(ctx context.Context, srcfn string) (numberofpages int, err error)

PdfPageNum returns the number of pages

func PdfRewrite

func PdfRewrite(ctx context.Context, destfn, srcfn string) error

PdfRewrite converts PDF to PDF (rewrites as PDF->PS->PDF)

func PdfSplit

func PdfSplit(ctx context.Context, srcfn string, pages []uint16) (filenames []string, cleanup func() error, err error)

PdfSplit splits pdf to pages, returns those filenames

func PdfToImage

func PdfToImage(ctx context.Context, w io.Writer, r io.Reader, contentType, size string) error

PdfToImage converts PDF to image using PdfToImageGm if available and the result is OK, then PdfToImageCairo.

func PdfToImageCairo

func PdfToImageCairo(ctx context.Context, w io.Writer, r io.Reader, contentType, size string) error

PdfToImageCairo converts PDF to image using pdftocairo from poppler-utils.

func PdfToImageGm

func PdfToImageGm(ctx context.Context, w io.Writer, r io.Reader, contentType, size string) error

PdfToImageGm converts PDF to image using GraphicsMagick.

func PdfToImageMulti

func PdfToImageMulti(ctx context.Context, sfiles []string, imgmime, imgsize string) (imgfilenames []string, err error)

PdfToImageMulti converts PDF pages to images, using parallel threads

func PdfToPdf

func PdfToPdf(ctx context.Context, destfn string, r io.Reader, _ string) error

PdfToPdf "converts" PDF (application/pdf) to PDF (just copies)

func PdfToPs

func PdfToPs(ctx context.Context, destfn, srcfn string) error

PdfToPs converts PDF to postscript

func PngToImage added in v0.2.0

func PngToImage(ctx context.Context, w io.Writer, imgtyp string, r io.Reader) error

func PrependHeaderFilter

func PrependHeaderFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

PrependHeaderFilter writes Subject, From... headers at the beginning of the html/plain parts.

func PsToPdf

func PsToPdf(ctx context.Context, destfn, srcfn string) error

PsToPdf converts postscript to PDF

func SaveOriHTMLFilter

func SaveOriHTMLFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

SaveOriHTMLFilter reads text/html and saves it.

func ScanLines

func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error)

ScanLines is a split function for a Scanner that returns each line of text, unmodified. The returned line may be empty. The end-of-line marker is one optional carriage return followed by one mandatory newline. In regular expression notation, it is `\r?\n`. The last non-empty line of input will be returned even if it has no newline.

func SetLogger added in v0.1.0

func SetLogger(lgr *slog.Logger)

func SetRequestID added in v0.2.0

func SetRequestID(ctx context.Context, reqID string) context.Context

func SetupFilters

func SetupFilters(
	ctx context.Context,
	inch <-chan i18nmail.MailPart, resultch chan<- ArchFileItem,
	errch chan<- error,
) <-chan i18nmail.MailPart

SetupFilters applies filters on parts received on inch, and returns them on the returned channel

func Skip

func Skip(ctx context.Context, destfn string, r io.Reader, contentType string) error

Skip skips the conversion

func SlurpMail

func SlurpMail(ctx context.Context, partch chan<- i18nmail.MailPart, errch chan<- error, body io.Reader, contentType string)

SlurpMail splits mail to parts, returns parts and/or error on the given channels

func TextDecodeFilter

func TextDecodeFilter(ctx context.Context,
	inch <-chan i18nmail.MailPart, outch chan<- i18nmail.MailPart,
	files chan<- ArchFileItem, errch chan<- error,
)

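TextDecodeFilter writes Subject, From... headers at the beginning of the html/plain parts. (NOTE(review): this sentence is identical to PrependHeaderFilter's description and is probably a copy-paste leftover; the name suggests this filter decodes text parts instead - confirm against the source.)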

func TextToPdf

func TextToPdf(ctx context.Context, destfn string, r io.Reader, contentType string) error

TextToPdf converts text (text/plain) to PDF

func ZipFiles

func ZipFiles(dest io.Writer, skipOnError, unsafeArchFn bool, files ...ArchFileItem) (err error)

ZipFiles adds files (by handle) to zip (writer)

func ZipTree

func ZipTree(dest io.Writer, root string, skipOnError, unsafeArchFn bool) (err error)

ZipTree adds all files in the tree rooted at the given path to zip (writer)

Types

type ArchFileItem

type ArchFileItem struct {
	File     FileLike //opened file handle
	Error    error    //error
	Filename string   //name of the file
	Archive  string   //name in the archive
}

ArchFileItem groups an archive item

func MailToPdfFiles

func MailToPdfFiles(ctx context.Context, r io.Reader, contentType string) (files []ArchFileItem, err error)

MailToPdfFiles converts email to PDF files. Every mail part goes through all filters in Filters, in reverse order (last first).

func (ArchFileItem) ArchiveName

func (a ArchFileItem) ArchiveName() string

ArchiveName returns the archive name - Archive, Filename if set, otherwise File's name

type ArchItems

type ArchItems []ArchFileItem

ArchItems is a wrapper for []ArchFileItem for sort.Sort

func (ArchItems) Len

func (a ArchItems) Len() int

Len returns the length of ArchItems

func (ArchItems) Less

func (a ArchItems) Less(i, j int) bool

Less returns whether a[i] < a[j]

func (ArchItems) Sort

func (a ArchItems) Sort() ArchItems

Sort sorts ArchItems ArchiveName-ordered

func (ArchItems) Swap

func (a ArchItems) Swap(i, j int)

Swap swaps items i and j for sort.Sort

type Converter

type Converter func(context.Context, string, io.Reader, string) error

Converter converts to Pdf (destination filename, source reader and source content-type)

func GetConverter

func GetConverter(contentType string, mediaType map[string]string) (converter Converter)

GetConverter gets converter for the content-type

func NewTextConverter

func NewTextConverter(charset string) Converter

NewTextConverter converts encoded text to pdf - by decoding it

func (Converter) WithCache added in v0.1.0

func (c Converter) WithCache(ctx context.Context, destfn string, r io.Reader, sourceContentType, destContentType string) error

type FieldSetter

type FieldSetter interface {
	Set(key, value string) error
}

type FileLike

type FileLike interface {
	io.Reader
	io.Closer
	Statter
}

FileLike is a minimal needed interface for ArchFileItem.File

func MakeFileLike

func MakeFileLike(r io.Reader) FileLike

type FileMIMEDetector added in v0.0.3

type FileMIMEDetector struct{}

func (FileMIMEDetector) Match added in v0.0.3

func (d FileMIMEDetector) Match(b []byte) string

type FilterFunc

type FilterFunc func(context.Context, <-chan i18nmail.MailPart, chan<- i18nmail.MailPart, chan<- ArchFileItem, chan<- error)

FilterFunc is the type of the pipeline filter function; it must close its out channel on finish!

type Gotenberg added in v0.2.0

type Gotenberg struct {
	URL string

	Client *http.Client
	// contains filtered or unexported fields
}

func (*Gotenberg) PostFileNames added in v0.2.0

func (g *Gotenberg) PostFileNames(ctx context.Context, destfn string, urlPath string, filenames []string, contentType string) error

func (*Gotenberg) Valid added in v0.2.0

func (g *Gotenberg) Valid() bool

type HTTPMIMEDetector added in v0.0.3

type HTTPMIMEDetector struct{}

func (HTTPMIMEDetector) Match added in v0.0.3

func (d HTTPMIMEDetector) Match(b []byte) string

type MIMEDetector added in v0.0.3

type MIMEDetector interface {
	Match([]byte) string
}

type MagicMIMEDetector added in v0.2.4

type MagicMIMEDetector struct{}

func (MagicMIMEDetector) Match added in v0.2.4

func (d MagicMIMEDetector) Match(b []byte) string

type MultiMIMEDetector added in v0.0.3

type MultiMIMEDetector struct {
	Detectors []MIMEDetector
	Parallel  bool
}

func (MultiMIMEDetector) Match added in v0.0.3

func (d MultiMIMEDetector) Match(b []byte) string

type PortLock

type PortLock struct {
	// contains filtered or unexported fields
}

PortLock is a locker which locks by binding to a port on the loopback IPv4 interface

func NewPortLock

func NewPortLock(port int) *PortLock

NewPortLock returns a lock for port

func (*PortLock) Lock

func (p *PortLock) Lock()

Lock locks on port

func (*PortLock) Unlock

func (p *PortLock) Unlock()

Unlock unlocks the port lock

type RateLimiter

type RateLimiter interface {
	//Acquire acquires a token (blocks if none accessible)
	Acquire() Token
	//Release releases the token
	Release(Token)
}

RateLimiter is the interface for rate limiting

func NewRateLimiter

func NewRateLimiter(n int) RateLimiter

NewRateLimiter returns a RateLimiter

type ReadCloserFileLike

type ReadCloserFileLike struct {
	io.Reader
	io.Closer
	os.FileInfo
	// contains filtered or unexported fields
}

func (ReadCloserFileLike) Stat

func (fl ReadCloserFileLike) Stat() (os.FileInfo, error)

type ScannerReader

type ScannerReader struct {
	// contains filtered or unexported fields
}

ScannerReader uses a bufio.Scanner as an io.Reader

func (*ScannerReader) Read

func (sr *ScannerReader) Read(p []byte) (n int, err error)

Read implements io.Reader: reads at most len(p) bytes into p, returns the number of bytes read and/or the error encountered

type Statter

type Statter interface {
	Stat() (os.FileInfo, error)
}

type Token

type Token struct{}

Token is a token

type VasileMIMEDetector added in v0.0.3

type VasileMIMEDetector struct{}

func (VasileMIMEDetector) Match added in v0.0.3

func (d VasileMIMEDetector) Match(b []byte) string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL