task

package
v0.0.0-...-0d40728 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 12, 2021 License: Apache-2.0 Imports: 28 Imported by: 9

Documentation

Index

Constants

View Source
// Category names used to label values examined by outlier detection.
const (

	// OutlierAnomaly is the category name used for anomalies discovered by outlier detection.
	OutlierAnomaly = "anomaly"
	// OutlierRegular is the category name used for regular values as discovered by outlier detection.
	OutlierRegular = "regular"
)
View Source
const (
	// DefaultSeparator is the default separator to use when dealing with groupings.
	// NOTE(review): presumably used to join the parts of a grouping/composed name — confirm against callers.
	DefaultSeparator = "_"
)

Variables

This section is empty.

Functions

func Classify

func Classify(schemaPath string, dataset string, config *IngestTaskConfig) (string, error)

Classify will classify the dataset using a primitive.

func Clean

func Clean(schemaFile string, dataset string, params *IngestParams, config *IngestTaskConfig) (string, error)

Clean will clean bad data for further processing.

func CloneDataset

func CloneDataset(sourceDatasetID string, cloneDatasetID string, cloneFolder string,
	metaStorage api.MetadataStorage, dataStorage api.DataStorage, filterParams *api.FilterParams) error

CloneDataset clones a dataset in metadata storage, data storage and on disk.

func CopyDiskDataset

func CopyDiskDataset(existingURI string, newURI string, newDatasetID string, newStorageName string) (*api.DiskDataset, error)

CopyDiskDataset copies an existing dataset on disk to a new location, updating the ID and the storage name.

func CreateComposedVariable

func CreateComposedVariable(metaStorage api.MetadataStorage, dataStorage api.DataStorage,
	dataset string, storageName string, composedVarName string, composedVarDisplayName string, sourceVarNames []string) error

CreateComposedVariable creates a new variable to use as group id.

func CreateDataset

func CreateDataset(dataset string, datasetCtor DatasetConstructor, outputPath string, config *env.Config) (string, string, error)

CreateDataset structures a raw csv file into a valid D3M dataset.

func CreateDatasetFromResult

func CreateDatasetFromResult(newDatasetName string, predictionDataset string, sourceDataset string, features []string,
	targetName string, resultURI string, datasetDescription string, metaStorage api.MetadataStorage, dataStorage api.DataStorage, config env.Config) (string, error)

CreateDatasetFromResult creates a new dataset based on a result set and the input to the model.

func CreateFeaturizedDatasetID

func CreateFeaturizedDatasetID(datasetID string) string

CreateFeaturizedDatasetID creates a dataset id for a learning dataset.

func DeleteDataset

func DeleteDataset(ds *api.Dataset, metaStorage api.MetadataStorage, dataStorage api.DataStorage, softDelete bool) error

DeleteDataset deletes a dataset from metadata and, if not a soft delete, from the database.

func DeleteQueryCache

func DeleteQueryCache(datasetID string)

DeleteQueryCache deletes the query cache folder if it exists.

func ExportDataset

func ExportDataset(dataset string, metaStorage api.MetadataStorage, dataStorage api.DataStorage, filterParams *api.FilterParams) (string, string, error)

ExportDataset extracts a dataset from the database and metadata storage, writing it to disk in D3M dataset format.

func Featurize

func Featurize(originalSchemaFile string, schemaFile string, data api.DataStorage, storage api.MetadataStorage, dataset string, config *IngestTaskConfig) error

Featurize provides a separate step for featurizing data so that it can be called independently of the ingest step.

func FeaturizeDataset

func FeaturizeDataset(originalSchemaFile string, schemaFile string, dataset string, metaStorage api.MetadataStorage, config *IngestTaskConfig) (string, string, error)

FeaturizeDataset creates a featurized output of the data that can be used in simplified pipelines.

func Format

func Format(schemaFile string, dataset string, config *IngestTaskConfig) (string, error)

Format will format a dataset to have the required structures for D3M.

func GeocodeForwardDataset

func GeocodeForwardDataset(schemaFile string, dataset string, config *IngestTaskConfig) (string, error)

GeocodeForwardDataset geocodes fields that are types of locations. The results are appended to the dataset and the whole is output to disk.

func GetUniqueOutputFolder

func GetUniqueOutputFolder(dataset string, outputPath string) (string, error)

GetUniqueOutputFolder produces a unique name for a dataset in a folder.

func ImportPredictionDataset

func ImportPredictionDataset(params *PredictParams) (string, string, error)

ImportPredictionDataset imports a dataset to be used for predictions.

func Ingest

func Ingest(originalSchemaFile string, schemaFile string, data api.DataStorage,
	storage api.MetadataStorage, params *IngestParams, config *IngestTaskConfig, steps *IngestSteps) (string, error)

Ingest the metadata to ES and the data to Postgres.

func IngestMetadata

func IngestMetadata(originalSchemaFile string, schemaFile string, data api.DataStorage,
	storage api.MetadataStorage, params *IngestParams, config *IngestTaskConfig, steps *IngestSteps) (string, error)

IngestMetadata ingests the data to ES.

func IngestPostgres

func IngestPostgres(originalSchemaFile string, schemaFile string, params *IngestParams, config *IngestTaskConfig, steps *IngestSteps) error

IngestPostgres ingests a dataset to PG storage.

func IngestPredictionDataset

func IngestPredictionDataset(params *PredictParams) error

IngestPredictionDataset ingests a dataset to be used for predictions.

func JoinDatamart

func JoinDatamart(joinLeft *JoinSpec, joinRight *JoinSpec, rightOrigin *model.DatasetOrigin) (string, *apiModel.FilteredData, error)

JoinDatamart will make all your dreams come true.

func JoinDistil

func JoinDistil(dataStorage apiModel.DataStorage, joinLeft *JoinSpec, joinRight *JoinSpec, joinPairs []*JoinPair, joinType string, returnRaw bool) (string, *apiModel.FilteredData, error)

JoinDistil will bring misery.

func LoadFittedSolution

func LoadFittedSolution(fittedSolutionURI string, solutionStorage api.SolutionStorage, metadataStorage api.MetadataStorage) (string, error)

LoadFittedSolution loads a fitted solution via TA2TA3 API.

func LoadSolution

func LoadSolution(solutionURI string) (string, error)

LoadSolution loads an unfitted solution via TA2TA3 API.

func Merge

func Merge(schemaFile string, dataset string, config *IngestTaskConfig) (string, error)

Merge will merge data resources into a single data resource.

func NewDefaultClient

func NewDefaultClient(config env.Config, userAgent string, discoveryLogger middleware.MethodLogger) (*compute.Client, error)

NewDefaultClient creates a new client to use when submitting pipelines.

func Predict

func Predict(params *PredictParams) (string, error)

Predict processes input data to generate predictions.

func PrepExistingPredictionDataset

func PrepExistingPredictionDataset(params *PredictParams) (string, string, error)

PrepExistingPredictionDataset sets up an existing dataset to be usable for predictions.

func Query

func Query(params QueryParams) (map[string]interface{}, error)

Query uses a query pipeline to rank data by nearness to a target.

func Rank

func Rank(schemaPath string, dataset string, config *IngestTaskConfig) (string, error)

Rank will rank the dataset using a primitive.

func Sample

func Sample(originalSchemaFile string, schemaFile string, dataset string, config *IngestTaskConfig) (string, bool, int, error)

Sample takes a sample of the dataset, since larger datasets can lead to a broken user experience through long-lasting TA2 processing.

func SaveFittedSolution

func SaveFittedSolution(fittedSolutionID string, modelName string, modelDescription string, solutionStorage api.SolutionStorage, metadataStorage api.MetadataStorage) (*api.ExportedModel, error)

SaveFittedSolution saves a fitted solution to disk via TA2TA3 API.

func SaveSolution

func SaveSolution(solutionID string) (string, error)

SaveSolution saves a solution to disk via TA2TA3 API.

func SetClient

func SetClient(computeClient *compute.Client)

SetClient sets the compute client to use when invoking primitives.

func SetGroups

func SetGroups(datasetID string, rawGroupings []map[string]interface{}, data api.DataStorage, meta api.MetadataStorage, config *IngestTaskConfig) error

SetGroups updates the dataset metadata (as stored) to capture group information.

func Summarize

func Summarize(schemaPath string, dataset string, config *IngestTaskConfig) (string, error)

Summarize will summarize the dataset using a primitive.

func TargetRank

func TargetRank(dataset *api.Dataset, target string, features []*model.Variable, source metadata.DatasetSource) (map[string]float64, error)

TargetRank will rank the dataset relative to a target variable using a primitive.

func UpdateExtremas

func UpdateExtremas(dataset string, metaStorage api.MetadataStorage, dataStorage api.DataStorage) error

UpdateExtremas will update every field's extremas in the specified dataset.

func VerifySuggestedTypes

func VerifySuggestedTypes(dataset string, dataStorage api.DataStorage, metaStorage api.MetadataStorage) error

VerifySuggestedTypes expands the suggested types to include all valid types the database storage can support.

func VerticalConcat

func VerticalConcat(dataStorage apiModel.DataStorage, joinLeft *JoinSpec, joinRight *JoinSpec) (string, *apiModel.FilteredData, error)

VerticalConcat will bring mastery.

Types

type ClusterPoint

// ClusterPoint contains data that has been clustered.
// Slices of *ClusterPoint are returned by Cluster and ClusterExplainOutput.
type ClusterPoint struct {
	// NOTE(review): presumably the row's d3mIndex value, kept as a string — confirm.
	D3MIndex    string
	SourceField string
	// NOTE(review): presumably the cluster label assigned to the row — confirm.
	Label       string
}

ClusterPoint contains data that has been clustered.

func Cluster

func Cluster(dataset *api.Dataset, variable string, useKMeans bool, clusterCount int) (bool, []*ClusterPoint, error)

Cluster will cluster the dataset fields using a primitive.

func ClusterExplainOutput

func ClusterExplainOutput(variable string, resultURI string, explainURI string, config *env.Config) (bool, []*ClusterPoint, error)

ClusterExplainOutput clusters the explained output from a model.

type DatasetConstructor

// DatasetConstructor is used to build a dataset.
// PredictionTimeseriesDataset provides methods matching this interface.
type DatasetConstructor interface {
	// CreateDataset builds and returns a raw dataset, or an error.
	// NOTE(review): rootDataPath is presumably the folder the dataset is written under — confirm.
	CreateDataset(rootDataPath string, datasetName string, config *env.Config) (*serialization.RawDataset, error)
	// GetDefinitiveTypes returns the variables treated as definitive types.
	// An implementation may return an empty list.
	GetDefinitiveTypes() []*model.Variable
	// CleanupTempFiles is the cleanup hook for the constructor.
	// An implementation may do nothing.
	CleanupTempFiles()
}

DatasetConstructor is used to build a dataset.

type FeatureRequest

// FeatureRequest captures the properties of a request to a primitive.
type FeatureRequest struct {
	SourceVariableName  string
	FeatureVariableName string
	OutputVariableName  string
	Variable            *model.Variable
	// NOTE(review): presumably the pipeline submitted to produce the feature — confirm.
	Step                *description.FullySpecifiedPipeline
	Clustering          bool
}

FeatureRequest captures the properties of a request to a primitive.

type GeocodedPoint

// GeocodedPoint contains data that has been geocoded.
// A slice of *GeocodedPoint is returned by GeocodeForward.
type GeocodedPoint struct {
	D3MIndex    string
	SourceField string
	// Latitude and Longitude are carried as strings, not numeric values.
	Latitude    string
	Longitude   string
}

GeocodedPoint contains data that has been geocoded.

func GeocodeForward

func GeocodeForward(datasetInputDir string, dataset string, variable *model.Variable) ([]*GeocodedPoint, error)

GeocodeForward will geocode a column into lat & lon values.

type ImportanceResult

// ImportanceResult is the result from a ranking operation.
type ImportanceResult struct {
	// Path is serialized under the JSON key "path".
	Path     string    `json:"path"`
	// Features is serialized under the JSON key "features".
	// NOTE(review): presumably one importance score per feature — confirm ordering with the producer.
	Features []float64 `json:"features"`
}

ImportanceResult is the result from a ranking operation.

type IngestParams

// IngestParams contains the parameters needed to ingest a dataset.
type IngestParams struct {
	Source          metadata.DatasetSource
	// DataCtor and MetaCtor are constructors for the data and metadata storages.
	DataCtor        api.DataStorageCtor
	MetaCtor        api.MetadataStorageCtor
	ID              string
	Origins         []*model.DatasetOrigin
	Type            api.DatasetType
	Path            string
	// RawGroupings has the same type as the rawGroupings argument of SetGroups.
	RawGroupings    []map[string]interface{}
	IndexFields     []string
	// NOTE(review): presumably keyed by variable name — confirm.
	DefinitiveTypes map[string]*model.Variable
}

IngestParams contains the parameters needed to ingest a dataset.

func (*IngestParams) GetSchemaDocPath

func (i *IngestParams) GetSchemaDocPath() string

GetSchemaDocPath returns the schema path to use when ingesting.

type IngestResult

// IngestResult captures the result of a dataset ingest process.
// It is returned by IngestDataset.
type IngestResult struct {
	DatasetID string
	// NOTE(review): Sampled and RowCount presumably mirror the bool and int returned by Sample — confirm.
	Sampled   bool
	RowCount  int
}

IngestResult captures the result of a dataset ingest process.

func IngestDataset

func IngestDataset(params *IngestParams, config *IngestTaskConfig, steps *IngestSteps) (*IngestResult, error)

IngestDataset executes the complete ingest process for the specified dataset.

type IngestSteps

// IngestSteps is a collection of parameters that specify ingest behaviour.
// Every field is a boolean toggle; the exact effect of each flag is defined
// by the ingest functions that receive it.
type IngestSteps struct {
	ClassificationOverwrite bool
	VerifyMetadata          bool
	FallbackMerged          bool
	CreateMetadataTables    bool
	CheckMatch              bool
	SkipFeaturization       bool
}

IngestSteps is a collection of parameters that specify ingest behaviour.

type IngestTaskConfig

// IngestTaskConfig captures the necessary configuration for a data ingest.
// NewConfig creates one based on a distil config.
type IngestTaskConfig struct {
	DatasetBatchSize                 int
	HasHeader                        bool
	FeaturizationEnabled             bool
	GeocodingEnabled                 bool
	ClassificationOutputPathRelative string
	ClassificationEnabled            bool
	RankingOutputPathRelative        string
	// NOTE(review): the Database* fields presumably describe the Postgres
	// connection used when ingesting data — confirm. DatabasePassword is a
	// credential, so avoid logging this struct verbatim.
	DatabasePassword                 string
	DatabaseUser                     string
	Database                         string
	DatabaseHost                     string
	DatabasePort                     int
	DatabaseBatchSize                int
	DatabaseLogLevel                 string
	ImputeEnabled                    bool
	SummaryOutputPathRelative        string
	SummaryMachineOutputPathRelative string
	SummaryEnabled                   bool
	// NOTE(review): presumably the Elasticsearch endpoint that metadata is ingested to — confirm.
	ESEndpoint                       string
	HardFail                         bool
	IngestOverwrite                  bool
	// NOTE(review): presumably the row threshold applied by Sample — confirm.
	SampleRowLimit                   int
}

IngestTaskConfig captures the necessary configuration for a data ingest.

func NewConfig

func NewConfig(config env.Config) *IngestTaskConfig

NewConfig creates an ingest config based on a distil config.

type JoinPair

// JoinPair captures the information required for a single join relationship.
// JoinDistil accepts a slice of *JoinPair.
type JoinPair struct {
	// NOTE(review): Left and Right presumably name the columns to match on each side of the join — confirm.
	Left             string
	Right            string
	Accuracy         float64
	// NOTE(review): presumably selects whether Accuracy is interpreted as an absolute value — confirm.
	AbsoluteAccuracy bool
}

JoinPair captures the information required for a single join relationship.

type JoinSpec

// JoinSpec stores information for one side of a join operation.
// JoinDatamart, JoinDistil and VerticalConcat each take a left and a right JoinSpec.
type JoinSpec struct {
	DatasetID        string
	DatasetPath      string
	DatasetSource    ingestMetadata.DatasetSource
	ExistingMetadata *model.Metadata
	UpdatedVariables []*model.Variable
}

JoinSpec stores information for one side of a join operation.

type OutlierPoint

// OutlierPoint records whether or not a datapoint is an outlier.
// A slice of *OutlierPoint is returned by OutlierDetection.
type OutlierPoint struct {
	D3MIndex string
	// NOTE(review): presumably OutlierAnomaly or OutlierRegular — confirm against OutlierDetection.
	Label    string
}

OutlierPoint records whether or not a datapoint is an outlier.

func OutlierDetection

func OutlierDetection(dataset *api.Dataset, variable string) ([]*OutlierPoint, error)

OutlierDetection finds outliers in either tabular or remote sensing data.

type PredictParams

// PredictParams contains all parameters passed to the predict function.
// It is also accepted by ImportPredictionDataset, IngestPredictionDataset,
// PrepExistingPredictionDataset and NewPredictionTimeseriesDataset.
type PredictParams struct {
	Meta               *model.Metadata
	LearningDataMeta   *model.Metadata
	SourceDataset      *api.Dataset
	Dataset            string
	SchemaPath         string
	SourceDatasetID    string
	SolutionID         string
	FittedSolutionID   string
	// DatasetConstructor is the builder used to create the prediction dataset.
	DatasetConstructor DatasetConstructor
	OutputPath         string
	IndexFields        []string
	Target             *model.Variable
	// Storage handles for metadata, data, solutions and exported models.
	MetaStorage        api.MetadataStorage
	DataStorage        api.DataStorage
	SolutionStorage    api.SolutionStorage
	ModelStorage       api.ExportedModelStorage
	IngestConfig       *IngestTaskConfig
	Config             *env.Config
}

PredictParams contains all parameters passed to the predict function.

type PredictionTimeseriesDataset

// PredictionTimeseriesDataset has the parameters necessary to create a
// timeseries dataset from minimal information. Construct it with
// NewPredictionTimeseriesDataset.
type PredictionTimeseriesDataset struct {
	// contains filtered or unexported fields
}

PredictionTimeseriesDataset has the parameters necessary to create a timeseries dataset from minimal information.

func NewPredictionTimeseriesDataset

func NewPredictionTimeseriesDataset(params *PredictParams, interval float64, count int) (*PredictionTimeseriesDataset, error)

NewPredictionTimeseriesDataset creates prediction timeseries dataset.

func (*PredictionTimeseriesDataset) CleanupTempFiles

func (p *PredictionTimeseriesDataset) CleanupTempFiles()

CleanupTempFiles does nothing.

func (*PredictionTimeseriesDataset) CreateDataset

func (p *PredictionTimeseriesDataset) CreateDataset(rootDataPath string, datasetName string, config *env.Config) (*serialization.RawDataset, error)

CreateDataset creates a raw dataset based on minimum timeseries parameters.

func (*PredictionTimeseriesDataset) GetDefinitiveTypes

func (p *PredictionTimeseriesDataset) GetDefinitiveTypes() []*model.Variable

GetDefinitiveTypes returns an empty list as definitive types.

type QueryParams

// QueryParams is a helper struct that simplifies calling the query task.
// It is passed by value to Query.
type QueryParams struct {
	Dataset     string
	// NOTE(review): presumably the target that Query ranks data against — confirm.
	TargetName  string
	DataStorage api.DataStorage
	MetaStorage api.MetadataStorage
	Filters     *api.FilterParams
}

QueryParams is a helper struct that simplifies calling the query task.

type SummaryResult

// SummaryResult represents a summary result.
type SummaryResult struct {
	// Summary is serialized under the JSON key "summary".
	Summary string `json:"summary"`
}

SummaryResult represents a summary result.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL