core

package
v0.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 20, 2025 | License: MIT | Imports: 3 | Imported by: 0

Documentation

Overview

Package core provides the core types and interfaces for the Parity dataset comparison tool.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type DatasetReader

// DatasetReader defines an interface for reading data from various sources.
// Data is consumed as a sequence of Arrow record batches obtained from Read.
type DatasetReader interface {
	// Read returns a record batch, or a non-nil error if one could not be
	// produced. It returns io.EOF when there are no more batches, so callers
	// should treat io.EOF as the normal end-of-data signal rather than as a
	// failure.
	//
	// NOTE(review): ownership of the returned record (whether the caller is
	// expected to Release it) is not specified here — confirm with the
	// implementations.
	Read(ctx context.Context) (arrow.Record, error)

	// Schema returns the schema of the dataset.
	Schema() *arrow.Schema

	// Close closes the reader and releases resources.
	Close() error
}

DatasetReader defines an interface for reading data from various sources.

type DatasetWriter

// DatasetWriter defines an interface for writing data to various destinations.
type DatasetWriter interface {
	// Write writes a record to the destination.
	//
	// NOTE(review): Close is documented as flushing pending data, so a
	// successful Write presumably does not guarantee the record has reached
	// the destination until Close returns — confirm with the implementations.
	Write(ctx context.Context, record arrow.Record) error

	// Close closes the writer and flushes any pending data.
	// The returned error should be checked, since it may report a failed flush.
	Close() error
}

DatasetWriter defines an interface for writing data to various destinations.

type DiffOptions

// DiffOptions provides options for the diff operation.
// It is passed by value to Differ.Diff.
type DiffOptions struct {
	// KeyColumns specifies the columns to use as keys for matching records
	// between the two datasets being compared.
	KeyColumns []string

	// IgnoreColumns specifies columns to ignore when comparing records.
	IgnoreColumns []string

	// BatchSize is the size of batches to process at once.
	// NOTE(review): presumably measured in rows per batch — confirm.
	BatchSize int64

	// Tolerance is the tolerance to use for floating point comparisons.
	// NOTE(review): whether this is an absolute or a relative tolerance is
	// not stated here — confirm with the Differ implementation.
	Tolerance float64

	// Parallel indicates whether to use parallel processing.
	Parallel bool

	// NumWorkers is the number of workers to use for parallel processing.
	// If 0, defaults to the number of CPUs.
	// NOTE(review): presumably only consulted when Parallel is true — confirm.
	NumWorkers int
}

DiffOptions provides options for the diff operation.

type DiffResult

// DiffResult represents the difference between two datasets.
// The differing rows are held as Arrow records, one per category of change,
// alongside aggregate counts in Summary.
type DiffResult struct {
	// Added contains records that exist in the target but not in the source.
	Added arrow.Record

	// Deleted contains records that exist in the source but not in the target.
	Deleted arrow.Record

	// Modified contains records that exist in both but have different values.
	// This includes columns to indicate which fields were modified.
	// NOTE(review): the naming and layout of those indicator columns are not
	// described here — confirm with the Differ implementation.
	Modified arrow.Record

	// Summary provides a summary of the differences.
	Summary DiffSummary
}

DiffResult represents the difference between two datasets.

type DiffSummary

// DiffSummary provides a summary of the differences between two datasets.
// All fields are counts; none of the record data itself is stored here.
type DiffSummary struct {
	// TotalSource is the total number of records in the source dataset.
	TotalSource int64

	// TotalTarget is the total number of records in the target dataset.
	TotalTarget int64

	// Added is the number of records added.
	Added int64

	// Deleted is the number of records deleted.
	Deleted int64

	// Modified is the number of records modified.
	Modified int64

	// Columns is a map of column names to the number of modifications in that column.
	// The zero value of DiffSummary leaves this map nil; it must be
	// initialized with make before any entry is assigned.
	Columns map[string]int64
}

DiffSummary provides a summary of the differences between two datasets.

type Differ

// Differ defines an interface for computing differences between datasets.
type Differ interface {
	// Diff computes the difference between two datasets.
	//
	// source and target are the two datasets to compare, each supplied as a
	// DatasetReader; options controls the comparison and is passed by value.
	// It returns a pointer to the resulting DiffResult, or an error.
	//
	// NOTE(review): whether Diff closes the readers it is given is not
	// specified here — confirm before relying on either behavior.
	Diff(ctx context.Context, source, target DatasetReader, options DiffOptions) (*DiffResult, error)
}

Differ defines an interface for computing differences between datasets.

type ReaderConfig

// ReaderConfig provides configuration for creating a reader.
// It combines file-oriented settings (Path) with database-oriented settings
// (ConnectionString, Table, Query) in a single struct.
// NOTE(review): which fields are required for a given Type is not stated
// here — confirm with the code that constructs readers.
type ReaderConfig struct {
	// Type is the type of the reader.
	// NOTE(review): the set of recognized values is not listed here.
	Type string

	// Path is the path to the file or directory.
	Path string

	// ConnectionString is the connection string for a database.
	ConnectionString string

	// Table is the table name for a database.
	Table string

	// Query is the query to execute for a database.
	// NOTE(review): how Query interacts with Table when both are set is not
	// specified here — confirm.
	Query string

	// BatchSize is the size of batches to read.
	// NOTE(review): presumably measured in rows per batch — confirm.
	BatchSize int64
}

ReaderConfig provides configuration for creating a reader.

type Reporter

// Reporter defines an interface for generating reports from diff results.
type Reporter interface {
	// Report generates a report from a diff result.
	//
	// The report is returned as an io.Reader for the caller to consume, or
	// an error is returned if the report could not be generated.
	// NOTE(review): the output format is presumably determined by the
	// implementation — confirm.
	Report(ctx context.Context, result *DiffResult) (io.Reader, error)
}

Reporter defines an interface for generating reports from diff results.

type WriterConfig

// WriterConfig provides configuration for creating a writer.
// It combines file-oriented settings (Path) with database-oriented settings
// (ConnectionString, Table) in a single struct.
// NOTE(review): which fields are required for a given Type is not stated
// here — confirm with the code that constructs writers.
type WriterConfig struct {
	// Type is the type of the writer.
	// NOTE(review): the set of recognized values is not listed here.
	Type string

	// Path is the path to the file or directory.
	Path string

	// ConnectionString is the connection string for a database.
	ConnectionString string

	// Table is the table name for a database.
	Table string

	// BatchSize is the size of batches to write.
	// NOTE(review): presumably measured in rows per batch — confirm.
	BatchSize int64
}

WriterConfig provides configuration for creating a writer.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL