compress

package
v1.3.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 21, 2026 License: MIT Imports: 27 Imported by: 0

Documentation

Overview

pkg/compress/compress.go

pkg/compress/compress_chunked.go

pkg/compress/compress_dict.go

pkg/compress/compress_xz.go

pkg/compress/compress_zip.go

pkg/compress/errors.go

pkg/compress/gitignore.go

pkg/compress/options.go

pkg/compress/pools.go

pkg/compress/progress.go

pkg/compress/result.go

Index

Constants

View Source
const (
	// MinDictSize is the minimum dictionary size (32 KiB) required by the zstd
	// encoder. The zstd library uses internal history buffers that require at
	// least 32KB.
	MinDictSize = 32 * 1024

	// MaxDictSize is the maximum useful dictionary size (112 KiB).
	MaxDictSize = 112 * 1024

	// MinSampleSizeForDict is the minimum size of an individual sample used
	// for dictionary training. Small samples are fine (the library handles
	// them well); only truly tiny samples, which add noise without useful
	// patterns, are skipped.
	// NOTE(review): presumably expressed in bytes like the sizes above - confirm
	// against the training code.
	MinSampleSizeForDict = 64
)

Variables

View Source
// Sentinel errors describing invalid option values and unsupported option
// combinations. They are plain errors.New values, so callers should match
// them with errors.Is rather than by comparing message text.
// NOTE(review): presumably these are what Options.Validate returns - confirm.
var (
	// ErrInputRequired is returned when the input path is not specified.
	ErrInputRequired = errors.New("input path is required")

	// ErrInvalidLevelZstd is returned when the zstd (GDELTA) compression level
	// is outside the range 1-22.
	ErrInvalidLevelZstd = errors.New("compression level for GDELTA (zstd) must be between 1 and 22")

	// ErrInvalidLevelZip is returned when the ZIP (deflate) compression level
	// is outside the range 1-9.
	ErrInvalidLevelZip = errors.New("compression level for ZIP (deflate) must be between 1 and 9")

	// ErrNoFiles is returned when no regular files are found to compress.
	ErrNoFiles = errors.New("no regular files found to compress")

	// ErrZipNoChunking is returned when chunking is requested together with
	// the ZIP format.
	ErrZipNoChunking = errors.New("chunk-based deduplication is not supported in ZIP format")

	// ErrZipNoDictionary is returned when dictionary compression is requested
	// together with the ZIP format.
	ErrZipNoDictionary = errors.New("dictionary compression is not supported in ZIP format")

	// ErrXzNoChunking is returned when chunking is requested together with
	// the XZ format.
	ErrXzNoChunking = errors.New("chunk-based deduplication is not supported in XZ format")

	// ErrXzNoDictionary is returned when dictionary compression is requested
	// together with the XZ format.
	ErrXzNoDictionary = errors.New("dictionary compression is not supported in XZ format")

	// ErrXzNoZip is returned when both the XZ and ZIP formats are requested.
	ErrXzNoZip = errors.New("cannot use both XZ and ZIP formats")

	// ErrInvalidLevelXz is returned when the XZ (LZMA2) compression level is
	// outside the range 1-9.
	ErrInvalidLevelXz = errors.New("compression level for XZ (LZMA2) must be between 1 and 9")

	// ErrDictionaryNoChunking is returned when dictionary compression and
	// chunking are both requested.
	ErrDictionaryNoChunking = errors.New("dictionary compression cannot be combined with chunking")

	// ErrInvalidParallelism is returned when the parallelism strategy is not
	// one of "auto", "folder", or "file".
	ErrInvalidParallelism = errors.New("parallelism must be 'auto', 'folder', or 'file'")

	// ErrChunkSizeTooSmall is returned when the chunk size is below the 4KB
	// (4096 byte) minimum.
	ErrChunkSizeTooSmall = errors.New("chunk size must be at least 4KB (4096 bytes)")

	// ErrChunkSizeTooLarge is returned when the chunk size exceeds the 64MB
	// (67108864 byte) maximum.
	ErrChunkSizeTooLarge = errors.New("chunk size must not exceed 64MB (67108864 bytes)")
)

Functions

func FormatSize added in v0.0.5

func FormatSize(bytes uint64) string

FormatSize formats bytes into human-readable string

func FormatSummary added in v0.0.5

func FormatSummary(result *Result, opts *Options) string

FormatSummary formats a compression result into a human-readable summary string

func TruncateLeft added in v0.0.5

func TruncateLeft(path string, maxLen int) string

TruncateLeft truncates a path from the left to fit maxLen, preserving the filename

Types

type EventType

type EventType int

EventType indicates the type of progress event

// Event kinds carried in ProgressEvent.Type. Values are assigned by iota in
// declaration order (EventStart = 0 through EventDictTraining = 6), so
// inserting or reordering entries changes the numeric value of every later
// constant; append new kinds at the end.
// NOTE(review): the per-event meanings below are inferred from the names -
// confirm against the code that emits them.
const (
	// EventStart is 0 and therefore also the zero value of EventType;
	// presumably signals the start of the whole operation.
	EventStart EventType = iota
	// EventFileStart presumably signals that work on one file has begun.
	EventFileStart
	// EventFileProgress presumably reports intermediate progress on a file.
	EventFileProgress
	// EventFileComplete presumably signals that one file has finished.
	EventFileComplete
	// EventComplete presumably signals the end of the whole operation.
	EventComplete
	// EventError presumably reports a failure.
	EventError
	EventDictTraining // Dictionary training phase for GDELTA03
)

type Options

type Options struct {
	// InputPath is the input path (file or directory).
	// Ignored if Files is provided.
	InputPath string

	// Files allows library users to provide a custom list of files/folders to
	// compress. When set, InputPath is ignored.
	// Each path can be absolute or relative, file or directory.
	// This option is for library use only (not exposed in the CLI).
	Files []string

	// OutputPath is the output archive path.
	OutputPath string

	// MaxThreads is the maximum number of concurrent compression threads.
	// Default: runtime.NumCPU()
	MaxThreads int

	// Parallelism is the parallelism strategy: "auto", "folder", or "file"
	// (ParallelismAuto, ParallelismFolder, ParallelismFile).
	// Default: "auto"
	Parallelism Parallelism

	// MaxThreadMemory is the maximum memory per thread before flushing to
	// disk, in bytes.
	// 0 = unlimited (flush only at folder boundaries)
	// Default: 0
	MaxThreadMemory uint64

	// ChunkSize is the chunk size for content-based deduplication, in bytes.
	// 0 = disabled (traditional file-level compression)
	// The package's ErrChunkSizeTooSmall and ErrChunkSizeTooLarge errors
	// describe bounds of 4KB (4096) and 64MB (67108864) for this value.
	// Default: 0
	ChunkSize uint64

	// ChunkStoreSize is the maximum chunk store size in MB (bounds memory
	// usage for deduplication).
	// Calculated as: maxChunks = ChunkStoreSize / (ChunkSize / 1MB)
	// 0 = unlimited (store all unique chunks)
	// Default: 0
	ChunkStoreSize uint64

	// Level is the compression level (1-22 for zstd, 1-9 for zip deflate;
	// ErrInvalidLevelXz likewise describes 1-9 for XZ/LZMA2).
	// 1=fastest, 9=balanced, 19+=maximum compression (zstd only)
	// Default: 5
	Level int

	// UseZipFormat creates a standard ZIP archive instead of GDELTA format.
	// Uses Deflate compression (universally compatible).
	// Cannot be combined with ChunkSize (deduplication not supported in ZIP
	// mode); the package also declares ErrZipNoDictionary and ErrXzNoZip for
	// the combinations with UseDictionary and UseXzFormat.
	// Default: false
	UseZipFormat bool

	// UseXzFormat creates standard .tar.xz archives instead of GDELTA format.
	// Uses LZMA2 compression (best compression ratio, slower than zstd).
	// Cannot be combined with ChunkSize or UseDictionary; the package also
	// declares ErrXzNoZip for the combination with UseZipFormat.
	// Default: false
	UseXzFormat bool

	// UseDictionary enables GDELTA03 dictionary-based compression.
	// Trains a zstd dictionary from input files for better compression.
	// Especially effective for many small files with common patterns.
	// Cannot be combined with ChunkSize or UseZipFormat; the package also
	// declares ErrXzNoDictionary for the combination with UseXzFormat.
	// Default: false
	UseDictionary bool

	// DryRun simulates compression without writing.
	DryRun bool

	// Verbose enables detailed logging.
	Verbose bool

	// ProgressWriter receives progress updates (optional).
	// If nil and Quiet=false, progress goes to stdout.
	ProgressWriter io.Writer

	// Quiet suppresses all output except errors.
	Quiet bool

	// UseGitignore respects .gitignore files to exclude matching paths.
	UseGitignore bool

	// DisableGC disables garbage collection during compression for maximum
	// throughput. Uses pooled buffers to minimize allocations. GC is re-enabled
	// after compression completes. Only affects ZIP compression mode.
	// Default: false
	DisableGC bool
}

Options configures the compression behavior

func DefaultOptions

func DefaultOptions() *Options

DefaultOptions returns options with sensible defaults

func (*Options) Validate

func (o *Options) Validate() error

Validate checks if options are valid

type Parallelism added in v0.0.10

type Parallelism string

Parallelism defines the parallelism strategy

// Supported values for Options.Parallelism.
const (
	// ParallelismAuto auto-detects the strategy based on the input structure.
	// It uses folder mode if there are enough folders, file mode otherwise.
	ParallelismAuto Parallelism = "auto"

	// ParallelismFolder processes whole folders per worker (the original
	// behavior).
	// Best when: many folders with few files each.
	ParallelismFolder Parallelism = "folder"

	// ParallelismFile processes individual files per worker with folder
	// affinity: files from the same folder go to the same worker for locality.
	// Best when: flat directories or few folders with many files.
	ParallelismFile Parallelism = "file"
)

type ProgressCallback

type ProgressCallback func(event ProgressEvent)

ProgressCallback is called for various progress events

func ProgressBarCallback added in v0.0.5

func ProgressBarCallback() (ProgressCallback, *mpb.Progress)

ProgressBarCallback creates a progress callback that displays multi-progress bars. It returns the callback function and the progress container (call Wait() on the container after compression).

type ProgressEvent

// NOTE(review): the field meanings below are inferred from names and types
// only; confirm against the code that constructs ProgressEvent values.
type ProgressEvent struct {
	// Type is the kind of event (one of the EventType constants).
	Type           EventType
	// FilePath is presumably the file the event refers to - TODO confirm
	// whether it is set for non-file events.
	FilePath       string
	// Current and Total are presumably an item counter and its upper bound
	// (e.g. files done / files overall) - TODO confirm.
	Current        int64
	Total          int64
	// CurrentBytes and TotalBytes are presumably the byte-level equivalent of
	// Current and Total - TODO confirm.
	CurrentBytes   uint64
	TotalBytes     uint64
	// CompressedSize is presumably the compressed output size in bytes at the
	// time of the event - TODO confirm.
	CompressedSize uint64
}

ProgressEvent contains progress information

type Result

type Result struct {
	// FilesTotal is the total number of files found.
	FilesTotal int

	// FilesProcessed is the number of files successfully compressed.
	FilesProcessed int

	// OriginalSize is the total original size in bytes.
	OriginalSize uint64

	// CompressedSize is the total compressed size in bytes.
	CompressedSize uint64

	// ChunkSize is the configured chunk size (0 if chunking disabled).
	ChunkSize uint64

	// Chunk deduplication statistics (populated when chunking is enabled).
	TotalChunks   uint64 // Total chunks processed
	UniqueChunks  uint64 // Unique chunks stored
	DedupedChunks uint64 // Chunks that were deduplicated
	BytesSaved    uint64 // Bytes saved through deduplication
	Evictions     uint64 // Chunks evicted from LRU cache (doesn't affect archive)

	// Errors is the list of non-fatal errors encountered.
	Errors []error
}

Result contains statistics about the compression operation

func Compress

func Compress(opts *Options, progressCb ProgressCallback) (*Result, error)

Compress compresses the files selected by opts (opts.InputPath, or opts.Files when set) into an archive at opts.OutputPath, reporting progress through progressCb.

func (*Result) CompressionRatio

func (r *Result) CompressionRatio() float64

CompressionRatio returns the compression ratio as a percentage

func (*Result) DedupRatio added in v0.0.2

func (r *Result) DedupRatio() float64

DedupRatio returns the deduplication ratio as a percentage

func (*Result) GetCompressedSize added in v0.0.5

func (r *Result) GetCompressedSize() uint64

GetCompressedSize returns compressed size (interface method)

func (*Result) GetErrors added in v0.0.5

func (r *Result) GetErrors() []error

GetErrors returns the error list (interface method)

func (*Result) GetFilesProcessed added in v0.0.5

func (r *Result) GetFilesProcessed() int

GetFilesProcessed returns processed files (interface method)

func (*Result) GetFilesTotal added in v0.0.5

func (r *Result) GetFilesTotal() int

GetFilesTotal returns total files (interface method)

func (*Result) GetOriginalSize added in v0.0.5

func (r *Result) GetOriginalSize() uint64

GetOriginalSize returns original size (interface method)

func (*Result) Success

func (r *Result) Success() bool

Success returns true if all files were processed without errors

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL