gafit

package

v0.0.0-...-1a937e6 Latest Latest Go to latest Published: Mar 1, 2022 License: MIT Imports: 20 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/davidkleiven/gogafit

Links

Open Source Insights

Documentation ¶

Overview ¶

Example ¶

package main

import (
	"fmt"
	"math"
	"math/rand"

	"github.com/MaxHalford/eaopt"
	"github.com/davidkleiven/gogafit/gafit"
	"gonum.org/v1/gonum/mat"
)

// sampleData builds a small synthetic dataset for the example.
//
// The design matrix has 20 rows and 5 columns holding the monomials
// x^0 .. x^4 evaluated at x = 0.0, 0.1, ..., 1.9. The target is the cubic
// 5 - 2x^3, so only the "const" and "x^3" columns carry any signal.
func sampleData() gafit.Dataset {
	const (
		numRows = 20
		numCols = 5
	)

	features := mat.NewDense(numRows, numCols, nil)
	target := mat.NewVecDense(numRows, nil)

	for row := 0; row < numRows; row++ {
		x := 0.1 * float64(row)

		// Column `power` stores x raised to that power.
		for power := 0; power < numCols; power++ {
			features.Set(row, power, math.Pow(x, float64(power)))
		}
		target.SetVec(row, 5.0-2.0*x*x*x)
	}

	return gafit.Dataset{
		X:        features,
		Y:        target,
		ColNames: []string{"const", "x", "x^2", "x^3", "x^4"},
	}
}

// main runs a genetic algorithm that selects the subset of features in the
// sample dataset minimising the AICC of a linear model, refines the best
// genome with a local optimization, and prints the names of the selected
// features.
func main() {
	// Set a seed such that the run is deterministic
	rand.Seed(4)

	// Initialize GA with default configuration
	ga, err := eaopt.NewDefaultGAConfig().NewGA()
	if err != nil {
		fmt.Println(err)
		return
	}

	// Set the number of generations to run for
	ga.NGenerations = 100

	// Add a custom print function to track progress
	ga.Callback = func(g *eaopt.GA) {
		// Optionally print progress information (commented out for this example)
		// fmt.Printf("Best fitness at generation %d: %f\n", g.Generations, g.HallOfFame[0].Fitness)
	}

	// Initialize a dataset
	data := sampleData()

	// Initialize the linear model factory
	factory := gafit.LinearModelFactory{
		Config: gafit.LinearModelConfig{
			Data: data,

			// We use AICC as a measure of the quality of the model
			Cost: gafit.Aicc,
		},
	}

	// Find the minimum
	if err := ga.Minimize(factory.Generate); err != nil {
		fmt.Println(err)
		return
	}

	// Guard against an empty hall of fame before indexing into it.
	if len(ga.HallOfFame) == 0 {
		fmt.Println("hall of fame is empty")
		return
	}

	// Extract the best genome. The checked assertion avoids a panic if the
	// genome is not the type produced by the factory.
	best, ok := ga.HallOfFame[0].Genome.(*gafit.LinearModel)
	if !ok {
		fmt.Printf("unexpected genome type %T\n", ga.HallOfFame[0].Genome)
		return
	}

	// Run local optimization on the best genome
	res := best.Optimize()

	// Print the selected features
	fmt.Printf("%v\n", data.IncludedFeatures(res.Include))
}

Output:

[const x^3]

Index ¶

Constants
func Aic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64
func Aicc(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64
func AllEqualInt(s1 []int, s2 []int) bool
func Bic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64
func CovMatrix(X *mat.Dense, rss float64) (*mat.SymDense, error)
func DemoCostFuncPython(pyExec string) (string, error)
func Fit(X *mat.Dense, y *mat.VecDense) *mat.VecDense
func FitSVD(X *mat.Dense, y *mat.VecDense) *mat.VecDense
func GAProgressLogger(ga *eaopt.GA)
func GeneralizedCV(rmse float64, X *mat.Dense) float64
func HatMatrix(X *mat.Dense) *mat.Dense
func LogLikelihood(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64
func Pred(X *mat.Dense, coeff *mat.VecDense) *mat.VecDense
func Rmse(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64
func Rss(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64
func SaveModel(fname string, model Model) error
func SavePredictions(fname string, pred []Prediction) error
func Write(fname string, X *mat.Dense, y *mat.VecDense, featNames []string, ...) error
func WriteFile(f *os.File, X *mat.Dense, y *mat.VecDense, featNames []string, ...) error
type CaptureFunction
type CaptureResult
- func NewCaptureResult() CaptureResult
- func (cr CaptureResult) GetFloat(name string) float64
- func (cr CaptureResult) GetInt(name string) int
- func (cr CaptureResult) GetString(name string) string
type CostFunction
type CostFunctionHook
- func NewCostFunctionHook(script string) CostFunctionHook
- func (cfh CostFunctionHook) Cleanup()
- func (cfh CostFunctionHook) Execute(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64
type Dataset
- func AddPoly(cols []int, data Dataset, order int) Dataset
- func Read(fname string, targetName string) (Dataset, error)
- func ReadFile(csvfile *os.File, targetName string) (Dataset, error)
- func (data Dataset) Columns(pattern string) []int
- func (data Dataset) Copy() Dataset
- func (data Dataset) Dot(coeff map[string]float64) *mat.VecDense
- func (data Dataset) IncludedFeatures(indicator []int) []string
- func (data Dataset) IsEqual(other Dataset) bool
- func (data Dataset) NumData() int
- func (data Dataset) NumFeatures() int
- func (data Dataset) Submatrix(names []string) *mat.Dense
type DemoCostFunc
type EBic
- func NewDefaultEBic(maxNumFeat int) EBic
- func (e EBic) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64
type FittedModel
type GABackupCB
- func (gab *GABackupCB) Build() func(ga *eaopt.GA)
type Hook
type LinearModel
- func (l *LinearModel) Clone() eaopt.Genome
- func (l *LinearModel) Crossover(other eaopt.Genome, rng *rand.Rand)
- func (l *LinearModel) Evaluate() (float64, error)
- func (l *LinearModel) GetCoeff() *mat.VecDense
- func (l *LinearModel) IncludedCols() []int
- func (l *LinearModel) IsEmpty() bool
- func (l *LinearModel) IsEqual(other LinearModel) bool
- func (l *LinearModel) Mutate(rng *rand.Rand)
- func (l *LinearModel) MutationRate() float64
- func (l *LinearModel) NumIncluded() int
- func (l *LinearModel) NumSplits() uint
- func (l *LinearModel) Optimize() OptimizeResult
type LinearModelConfig
- func (lmc *LinearModelConfig) GetCostFunction() CostFunction
- func (lmc LinearModelConfig) IsEqual(other LinearModelConfig) bool
- func (lmc LinearModelConfig) LargestModel() int
type LinearModelFactory
- func (lmf *LinearModelFactory) Generate(rng *rand.Rand) eaopt.Genome
type Model
- func NewModel(best eaopt.Individual, dataset Dataset, cost string, datafile string) Model
- func ReadModel(fname string) (Model, error)
type ModelIterator
- func (m *ModelIterator) Flip()
- func (m *ModelIterator) Next() []int
- func (m *ModelIterator) UndoLastFlip()
type OptimizeResult
- func OrthogonalMatchingPursuit(dataset Dataset, cost CostFunction, maxFeatures int) OptimizeResult
- func (or *OptimizeResult) IsEqual(other OptimizeResult) bool
type Prediction
- func GetPredictions(data Dataset, model Model, predData *Dataset) []Prediction
- func ReadPredictions(fname string) ([]Prediction, error)
- func (p Prediction) IsEqual(other Prediction) bool
type PredictionErrorFIC
- func (pef *PredictionErrorFIC) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64
type Score
type SubMatrix
- func (s *SubMatrix) At(i, j int) float64
- func (s *SubMatrix) Dims() (int, int)
- func (s *SubMatrix) T() mat.Matrix

Examples ¶

Package

Constants ¶

View Source

const CostFunctionIdentifier = "GOGAFIT_COST:"

CostFunctionIdentifier is a pattern that is searched for in the output of the hook. The floating point number that follows it is extracted

Variables ¶

This section is empty.

Functions ¶

func Aic ¶

func Aic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Aic returns Akaike's information criterion

func Aicc ¶

func Aicc(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Aicc returns the corrected Akaike's information criterion

func AllEqualInt ¶

func AllEqualInt(s1 []int, s2 []int) bool

AllEqualInt checks if all elements in s1 equal the corresponding elements in s2

func Bic ¶

func Bic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Bic returns the Bayesian information criterion

func CovMatrix ¶

func CovMatrix(X *mat.Dense, rss float64) (*mat.SymDense, error)

CovMatrix calculates the covariance matrix between the coefficients

func DemoCostFuncPython ¶

func DemoCostFuncPython(pyExec string) (string, error)

DemoCostFuncPython generates a demo script for python

func Fit ¶

func Fit(X *mat.Dense, y *mat.VecDense) *mat.VecDense

Fit solves the least square problem

func GAProgressLogger ¶

func GAProgressLogger(ga *eaopt.GA)

GAProgressLogger can be used as a callback to the GA algorithm. It logs the progress of the method

func GeneralizedCV ¶

func GeneralizedCV(rmse float64, X *mat.Dense) float64

GeneralizedCV returns the generalized CV, given by rmse/(1 - Tr(H)/N), where H is the HatMatrix and N is the number of datapoints

func HatMatrix ¶

func HatMatrix(X *mat.Dense) *mat.Dense

HatMatrix returns the matrix that maps training data onto predictions: y = Hy', where y' are the training points. In the case of linear regression, y = Xc, where c is a coefficient vector given by c = (X^TX)^{-1}X^Ty', so the hat matrix is H = X(X^TX)^{-1}X^T. Internally, H is calculated by using the QR decomposition of X

func LogLikelihood ¶

func LogLikelihood(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

LogLikelihood returns the logarithm of the likelihood function, assuming a normally distributed variable

func Pred ¶

func Pred(X *mat.Dense, coeff *mat.VecDense) *mat.VecDense

Pred predicts the outcome of the linear model

func Rmse ¶

func Rmse(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Rmse returns the root mean square error of the residuals

func Rss ¶

func Rss(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Rss returns the residual sum of squares

func SaveModel ¶

func SaveModel(fname string, model Model) error

SaveModel writes a JSON version of the model to file

func SavePredictions ¶

func SavePredictions(fname string, pred []Prediction) error

SavePredictions stores the predictions in a file

func Write ¶

func Write(fname string, X *mat.Dense, y *mat.VecDense, featNames []string, targetName string) error

Write writes a dataset to file. The target values are appended as the last column

func WriteFile ¶

func WriteFile(f *os.File, X *mat.Dense, y *mat.VecDense, featNames []string, targetName string) error

WriteFile writes dataset to file

Types ¶

type CaptureFunction ¶

type CaptureFunction func(out string) (CaptureResult, error)

CaptureFunction is a type used to capture results from a string

type CaptureResult ¶

type CaptureResult struct {
	Floats  map[string]float64
	Ints    map[string]int
	Strings map[string]string
}

CaptureResult is a type used to represent results captured from a string by a CaptureFunction

func NewCaptureResult ¶

func NewCaptureResult() CaptureResult

NewCaptureResult returns a new initialized instance of CaptureResult

func (CaptureResult) GetFloat ¶

func (cr CaptureResult) GetFloat(name string) float64

GetFloat returns captured float values

func (CaptureResult) GetInt ¶

func (cr CaptureResult) GetInt(name string) int

GetInt returns captured int values

func (CaptureResult) GetString ¶

func (cr CaptureResult) GetString(name string) string

GetString returns captured string values

type CostFunction ¶

type CostFunction func(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

CostFunction is a type used to represent cost functions for fitting

type CostFunctionHook ¶

type CostFunctionHook struct {
	Hook    Hook
	TmpFile string
}

CostFunctionHook is a type used to represent external cost functions

func NewCostFunctionHook ¶

func NewCostFunctionHook(script string) CostFunctionHook

NewCostFunctionHook returns a new instance of a cost function

func (CostFunctionHook) Cleanup ¶

func (cfh CostFunctionHook) Cleanup()

Cleanup erases temporary file created by the application

func (CostFunctionHook) Execute ¶

func (cfh CostFunctionHook) Execute(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Execute runs the external cost function hook and returns the resulting cost

type Dataset ¶

type Dataset struct {
	X *mat.Dense
	Y *mat.VecDense

	// ColNames gives the name of the "feature" stored in each column of X
	ColNames   []string
	TargetName string
}

Dataset is a type that holds the data of a linear model: the feature matrix X, the target vector Y and their names

func AddPoly ¶

func AddPoly(cols []int, data Dataset, order int) Dataset

AddPoly returns a new dataset where polynomial versions of the passed columns are inserted

func Read ¶

func Read(fname string, targetName string) (Dataset, error)

Read reads a dataset from a file

func ReadFile ¶

func ReadFile(csvfile *os.File, targetName string) (Dataset, error)

ReadFile creates a dataset from the passed file. If targetName is an empty string, the entire file will be added to the X matrix. If targetName is a non-empty string and is not found in the header, the function will return with an error

func (Dataset) Columns ¶

func (data Dataset) Columns(pattern string) []int

Columns returns the column numbers of all features where <pattern> is part of the name

func (Dataset) Copy ¶

func (data Dataset) Copy() Dataset

Copy returns a copy of the dataset

func (Dataset) Dot ¶

func (data Dataset) Dot(coeff map[string]float64) *mat.VecDense

Dot performs the dot product between X and a sparse coefficient vector given as a map of strings, where the key is a column name

func (Dataset) IncludedFeatures ¶

func (data Dataset) IncludedFeatures(indicator []int) []string

IncludedFeatures returns the features being included according to the passed indicator. 1: feature is included, 0: feature is not included

func (Dataset) IsEqual ¶

func (data Dataset) IsEqual(other Dataset) bool

IsEqual returns true if the two datasets are equal

func (Dataset) NumData ¶

func (data Dataset) NumData() int

NumData returns the number of datapoints

func (Dataset) NumFeatures ¶

func (data Dataset) NumFeatures() int

NumFeatures returns the number of features

func (Dataset) Submatrix ¶

func (data Dataset) Submatrix(names []string) *mat.Dense

Submatrix returns a submatrix corresponding to columns given

type DemoCostFunc ¶

type DemoCostFunc struct {
	OutputIdentifier string
	PythonExec       string
}

DemoCostFunc is a type that holds parameters for demo scripts

type EBic ¶

type EBic struct {
	Gamma          float64
	MaxNumFeatures int
}

EBic is a type used to calculate the extended BIC criterion. An implicit underlying assumption for BIC is that the prior distribution is constant for all models. This may not be feasible when the number of features is large. EBIC tries to penalize large models more heavily than BIC, by setting the prior distribution inversely proportional to the total number of models with a given size. If we have N features, and k features are selected, then the prior p(s) is proportional to tau^{-gamma}, where tau is the total number of models with that size (i.e. tau = N!/(k!(N-k)!)) and 0 <= gamma <= 1 is a tuning constant. If gamma is zero, then EBIC is equal to BIC

func NewDefaultEBic ¶

func NewDefaultEBic(maxNumFeat int) EBic

NewDefaultEBic returns a new Ebic function

func (EBic) Evaluate ¶

func (e EBic) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Evaluate evaluates the EBic criterion

type FittedModel ¶

type FittedModel struct {
	Rows  int
	Cols  int
	X     []float64
	Y     []float64
	Coeff []float64
	Names []string
}

FittedModel is a type that holds the design matrix, the target values and the coefficients

type GABackupCB ¶

type GABackupCB struct {
	Cost       string
	Dataset    Dataset
	DataFile   string
	Rate       uint
	BackupFile string
}

GABackupCB is a default type used to construct a default backup function

func (*GABackupCB) Build ¶

func (gab *GABackupCB) Build() func(ga *eaopt.GA)

Build constructs the callback function

type Hook ¶

type Hook struct {
	Script  string
	Capture CaptureFunction
}

Hook is a type that runs the script and captures results from the output using the Capture function

type LinearModel ¶

type LinearModel struct {
	Config  LinearModelConfig
	Include []int
}

LinearModel represents a genome

func (*LinearModel) Clone ¶

func (l *LinearModel) Clone() eaopt.Genome

Clone creates a copy

func (*LinearModel) Crossover ¶

func (l *LinearModel) Crossover(other eaopt.Genome, rng *rand.Rand)

Crossover performs a cross over

func (*LinearModel) Evaluate ¶

func (l *LinearModel) Evaluate() (float64, error)

Evaluate evaluates the fitness

func (*LinearModel) GetCoeff ¶

func (l *LinearModel) GetCoeff() *mat.VecDense

GetCoeff returns the coefficients corresponding to the current selection

func (*LinearModel) IncludedCols ¶

func (l *LinearModel) IncludedCols() []int

IncludedCols returns the indices of the columns that are included according to the 1/0 values in Include (1: included, 0: excluded)

func (*LinearModel) IsEmpty ¶

func (l *LinearModel) IsEmpty() bool

IsEmpty returns true if the model contains no features

func (*LinearModel) IsEqual ¶

func (l *LinearModel) IsEqual(other LinearModel) bool

IsEqual returns true if the two models are equal

func (*LinearModel) Mutate ¶

func (l *LinearModel) Mutate(rng *rand.Rand)

Mutate introduces mutations

func (*LinearModel) MutationRate ¶

func (l *LinearModel) MutationRate() float64

MutationRate returns the mutation rate. If not specified in Config (i.e. it is 0.0), a default value of 0.5 is used

func (*LinearModel) NumIncluded ¶

func (l *LinearModel) NumIncluded() int

NumIncluded returns the number of included columns

func (*LinearModel) NumSplits ¶

func (l *LinearModel) NumSplits() uint

NumSplits returns the number of splits used in crossover. If not set, 2 is used as default

func (*LinearModel) Optimize ¶

func (l *LinearModel) Optimize() OptimizeResult

Optimize performs a local optimization by flipping the inclusions in turn. After a call to this function, the included features are modified and set to the best genome found

type LinearModelConfig ¶

type LinearModelConfig struct {
	Data         Dataset
	Cost         CostFunction
	MutationRate float64
	NumSplits    uint

	// MaxFeatToDataRatio specifies the maximum value of #feat/#data. If not given,
	// a default value of 0.5 is used
	MaxFeatToDataRatio float64
}

LinearModelConfig contains static configuration for a linear model. It contains meta-information needed to fully define a LinearModel

func (*LinearModelConfig) GetCostFunction ¶

func (lmc *LinearModelConfig) GetCostFunction() CostFunction

GetCostFunction returns the cost function. If not given, AICC is used as default

func (LinearModelConfig) IsEqual ¶

func (lmc LinearModelConfig) IsEqual(other LinearModelConfig) bool

IsEqual returns true if other is equal to lmc. Otherwise, it returns false.

func (LinearModelConfig) LargestModel ¶

func (lmc LinearModelConfig) LargestModel() int

LargestModel returns the largest model consistent with the feature to data ratio

type LinearModelFactory ¶

type LinearModelFactory struct {
	Config LinearModelConfig

	// Probability of initially including each feature. If not set, a default value
	// of 0.5 is used. Example: a value of 0.2 will lead to 20% of all features being
	// included in the initial pool
	Prob float64
}

LinearModelFactory produces random models

func (*LinearModelFactory) Generate ¶

func (lmf *LinearModelFactory) Generate(rng *rand.Rand) eaopt.Genome

Generate creates a new random linear model

type Model ¶

type Model struct {
	Datafile   string
	TargetName string
	Coeffs     map[string]float64
	Score      Score
}

Model is a convenience type used to store information about a model

func NewModel ¶

func NewModel(best eaopt.Individual, dataset Dataset, cost string, datafile string) Model

NewModel creates a new fitted model from the best individual of a GA run

func ReadModel ¶

func ReadModel(fname string) (Model, error)

ReadModel reads a model from a JSON file

type ModelIterator ¶

type ModelIterator struct {
	Include []int
	MaxSize int
	// contains filtered or unexported fields
}

ModelIterator iterates through all models by sequentially flipping bits

func (*ModelIterator) Flip ¶

func (m *ModelIterator) Flip()

Flip flips the current bit

func (*ModelIterator) Next ¶

func (m *ModelIterator) Next() []int

Next returns the next model

func (*ModelIterator) UndoLastFlip ¶

func (m *ModelIterator) UndoLastFlip()

UndoLastFlip undoes the previous flip

type OptimizeResult ¶

type OptimizeResult struct {
	Score   float64
	Include []int
	Coeff   *mat.VecDense
}

OptimizeResult is returned by local optimization of the linear model

func OrthogonalMatchingPursuit ¶

func OrthogonalMatchingPursuit(dataset Dataset, cost CostFunction, maxFeatures int) OptimizeResult

OrthogonalMatchingPursuit optimizes the cost function by selecting the model that leads to the largest decrease in the cost function

func (*OptimizeResult) IsEqual ¶

func (or *OptimizeResult) IsEqual(other OptimizeResult) bool

IsEqual returns true if the two optimize results are equal

type Prediction ¶

type Prediction struct {
	Value float64
	Std   float64
}

Prediction is a type that represents a prediction (the expected value and the standard deviation)

func GetPredictions ¶

func GetPredictions(data Dataset, model Model, predData *Dataset) []Prediction

GetPredictions returns the predictions together with the standard deviations for all data in predData. If predData is nil, data will be used (e.g. for in-sample prediction errors)

func ReadPredictions ¶

func ReadPredictions(fname string) ([]Prediction, error)

ReadPredictions reads the predictions from a csv file (same as stored by SavePredictions)

func (Prediction) IsEqual ¶

func (p Prediction) IsEqual(other Prediction) bool

IsEqual returns true if the two predictions are equal

type PredictionErrorFIC ¶

type PredictionErrorFIC struct {
	Data []int
}

PredictionErrorFIC tries to select the model that has the highest precision for a subset of the data

func (*PredictionErrorFIC) Evaluate ¶

func (pef *PredictionErrorFIC) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Evaluate evaluates the focused information criterion

type Score ¶

type Score struct {
	Name  string
	Value float64
}

Score is a convenience type used to collect information about the quality of a model

type SubMatrix ¶

type SubMatrix struct {
	X    mat.Matrix
	Rows int
	Cols int
}

SubMatrix creates a sub-view of a matrix. The view contains the upper left corner starting from element (0, 0) and ending at (Rows, Cols)

func (*SubMatrix) At ¶

func (s *SubMatrix) At(i, j int) float64

At returns the value of element (i, j)

func (*SubMatrix) Dims ¶

func (s *SubMatrix) Dims() (int, int)

Dims returns the dimensions of the matrix

func (*SubMatrix) T ¶

func (s *SubMatrix) T() mat.Matrix

T returns the transpose of the matrix

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL