gafit

package
v0.0.0-...-1a937e6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 1, 2022 License: MIT Imports: 20 Imported by: 0

Documentation

Overview

Example
package main

import (
	"fmt"
	"math"
	"math/rand"

	"github.com/MaxHalford/eaopt"
	"github.com/davidkleiven/gogafit/gafit"
	"gonum.org/v1/gonum/mat"
)

// sampleData builds a small synthetic dataset. Row i corresponds to
// x = 0.1*i and stores the powers x^0 through x^4 as features, while the
// target value is 5 - 2x^3. Only the "const" and "x^3" columns are therefore
// needed to reproduce the target.
func sampleData() gafit.Dataset {
	const numPoints = 20
	names := []string{"const", "x", "x^2", "x^3", "x^4"}

	design := mat.NewDense(numPoints, len(names), nil)
	target := mat.NewVecDense(numPoints, nil)

	for row := 0; row < numPoints; row++ {
		x := 0.1 * float64(row)

		// Column col holds x raised to the power col
		for col := range names {
			design.Set(row, col, math.Pow(x, float64(col)))
		}
		target.SetVec(row, 5.0-2.0*x*x*x)
	}

	return gafit.Dataset{
		X:        design,
		Y:        target,
		ColNames: names,
	}
}

func main() {
	// Fix the seed of the global random source so that the run is deterministic
	rand.Seed(4)

	// Build a GA from the default configuration
	ga, err := eaopt.NewDefaultGAConfig().NewGA()
	if err != nil {
		fmt.Println(err)
		return
	}

	// Run the GA for 100 generations
	ga.NGenerations = 100

	// Install a callback. It is deliberately a no-op in this example; progress
	// could be reported here, e.g.
	// fmt.Printf("Best fitness at generation %d: %f\n", ga.Generations, ga.HallOfFame[0].Fitness)
	ga.Callback = func(*eaopt.GA) {}

	// Create the dataset and the factory that produces random linear models.
	// AICC is used as the measure of the quality of a model
	dataset := sampleData()
	config := gafit.LinearModelConfig{
		Data: dataset,
		Cost: gafit.Aicc,
	}
	factory := gafit.LinearModelFactory{Config: config}

	// Search for the model that minimizes the cost
	if err := ga.Minimize(factory.Generate); err != nil {
		fmt.Println(err)
		return
	}

	// Take the best genome found, refine it by local optimization and print
	// the names of the selected features
	winner := ga.HallOfFame[0].Genome.(*gafit.LinearModel)
	result := winner.Optimize()
	fmt.Printf("%v\n", dataset.IncludedFeatures(result.Include))
}
Output:

[const x^3]

Index

Examples

Constants

View Source
const CostFunctionIdentifier = "GOGAFIT_COST:"

CostFunctionIdentifier is a pattern that is searched for in the output of the hook. The floating point number that follows it is extracted

Variables

This section is empty.

Functions

func Aic

func Aic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Aic returns Akaike's information criterion

func Aicc

func Aicc(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Aicc returns the corrected Akaike's information criterion

func AllEqualInt

func AllEqualInt(s1 []int, s2 []int) bool

AllEqualInt checks whether all elements in s1 equal those in s2

func Bic

func Bic(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Bic returns the Bayes information criterion

func CovMatrix

func CovMatrix(X *mat.Dense, rss float64) (*mat.SymDense, error)

CovMatrix calculates the covariance matrix between the coefficients

func DemoCostFuncPython

func DemoCostFuncPython(pyExec string) (string, error)

DemoCostFuncPython generates a demo script for python

func Fit

func Fit(X *mat.Dense, y *mat.VecDense) *mat.VecDense

Fit solves the least squares problem

func FitSVD

func FitSVD(X *mat.Dense, y *mat.VecDense) *mat.VecDense

FitSVD returns the solution of X*c = y

func GAProgressLogger

func GAProgressLogger(ga *eaopt.GA)

GAProgressLogger can be used as a callback to the GA algorithm. It logs the progress of the method

func GeneralizedCV

func GeneralizedCV(rmse float64, X *mat.Dense) float64

GeneralizedCV returns the generalized CV, given by rmse/(1 - Tr(H)/N), where H is the HatMatrix and N is the number of datapoints

func HatMatrix

func HatMatrix(X *mat.Dense) *mat.Dense

HatMatrix returns the matrix that maps training data onto predictions: y = Hy', where y' are the training points. In the case of linear regression, y = Xc, where c is a coefficient vector given by c = (X^TX)^{-1}X^Ty', so the hat matrix is H = X(X^TX)^{-1}X^T. Internally, H is calculated by using the QR decomposition of X

func LogLikelihood

func LogLikelihood(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

LogLikelihood returns the logarithm of the likelihood function, assuming normally distributed variables

func Pred

func Pred(X *mat.Dense, coeff *mat.VecDense) *mat.VecDense

Pred predicts the outcome of the linear model

func Rmse

func Rmse(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Rmse returns the residual mean square error

func Rss

func Rss(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Rss returns the residual sum of squares

func SaveModel

func SaveModel(fname string, model Model) error

SaveModel writes a JSON version of the model to file

func SavePredictions

func SavePredictions(fname string, pred []Prediction) error

SavePredictions stores the predictions in a file

func Write

func Write(fname string, X *mat.Dense, y *mat.VecDense, featNames []string, targetName string) error

Write writes a dataset to file. The target values are appended as the last column

func WriteFile

func WriteFile(f *os.File, X *mat.Dense, y *mat.VecDense, featNames []string, targetName string) error

WriteFile writes dataset to file

Types

type CaptureFunction

type CaptureFunction func(out string) (CaptureResult, error)

CaptureFunction is a type used to capture results from a string

type CaptureResult

type CaptureResult struct {
	Floats  map[string]float64
	Ints    map[string]int
	Strings map[string]string
}

CaptureResult is a type used to represent results captured from a string (see CaptureFunction)

func NewCaptureResult

func NewCaptureResult() CaptureResult

NewCaptureResult returns a new initialized instance of CaptureResult

func (CaptureResult) GetFloat

func (cr CaptureResult) GetFloat(name string) float64

GetFloat returns captured float values

func (CaptureResult) GetInt

func (cr CaptureResult) GetInt(name string) int

GetInt returns captured int values

func (CaptureResult) GetString

func (cr CaptureResult) GetString(name string) string

GetString returns captured string values

type CostFunction

type CostFunction func(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

CostFunction is a type used to represent cost functions for fitting

type CostFunctionHook

type CostFunctionHook struct {
	Hook    Hook
	TmpFile string
}

CostFunctionHook is a type used to represent external cost functions

func NewCostFunctionHook

func NewCostFunctionHook(script string) CostFunctionHook

NewCostFunctionHook returns a new instance of a cost function

func (CostFunctionHook) Cleanup

func (cfh CostFunctionHook) Cleanup()

Cleanup erases temporary file created by the application

func (CostFunctionHook) Execute

func (cfh CostFunctionHook) Execute(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Execute runs

type Dataset

type Dataset struct {
	X *mat.Dense
	Y *mat.VecDense

	// ColNames gives the name of the "feature" stored in each column of X
	ColNames   []string
	TargetName string
}

Dataset is a type that represents a linear model

func AddPoly

func AddPoly(cols []int, data Dataset, order int) Dataset

AddPoly returns a new dataset where polynomial versions of the passed columns are inserted

func Read

func Read(fname string, targetName string) (Dataset, error)

Read reads a dataset from a file

func ReadFile

func ReadFile(csvfile *os.File, targetName string) (Dataset, error)

ReadFile creates a dataset from the passed file. If targetName is an empty string, the entire file will be added to the X matrix. If targetName is not an empty string and is not found in the header, the function will return with an error

func (Dataset) Columns

func (data Dataset) Columns(pattern string) []int

Columns returns the column numbers of all features where <pattern> is part of the name

func (Dataset) Copy

func (data Dataset) Copy() Dataset

Copy returns a copy of the dataset

func (Dataset) Dot

func (data Dataset) Dot(coeff map[string]float64) *mat.VecDense

Dot performs the dot product between X and a sparse coefficient vector given as a map of strings, where the key is a column name

func (Dataset) IncludedFeatures

func (data Dataset) IncludedFeatures(indicator []int) []string

IncludedFeatures returns the features being included according to the passed indicator. 1: feature is included, 0: feature is not included

func (Dataset) IsEqual

func (data Dataset) IsEqual(other Dataset) bool

IsEqual returns true if the two datasets are equal

func (Dataset) NumData

func (data Dataset) NumData() int

NumData returns the number of datapoints

func (Dataset) NumFeatures

func (data Dataset) NumFeatures() int

NumFeatures returns the number of features

func (Dataset) Submatrix

func (data Dataset) Submatrix(names []string) *mat.Dense

Submatrix returns a submatrix corresponding to columns given

type DemoCostFunc

type DemoCostFunc struct {
	OutputIdentifier string
	PythonExec       string
}

DemoCostFunc is a type that holds parameters for demo scripts

type EBic

type EBic struct {
	Gamma          float64
	MaxNumFeatures int
}

EBic is a type used to calculate the extended BIC criterion. An implicit underlying assumption for BIC is that the prior distribution is constant for all models. This may not be feasible when the number of features is large. EBIC tries to penalize large models more heavily than BIC, by setting the prior distribution inversely proportional to the total number of models with a given size. If we have N features and k features are selected, then the prior p(s) is proportional to tau^{-gamma}, where tau is the total number of models with that size (i.e. tau = N!/(k!(N-k)!)) and 0 <= gamma <= 1 is a tuning constant. If gamma is zero, then EBIC is equal to BIC

func NewDefaultEBic

func NewDefaultEBic(maxNumFeat int) EBic

NewDefaultEBic returns a new Ebic function

func (EBic) Evaluate

func (e EBic) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense, names []string) float64

Evaluate evaluates the EBic criterion

type FittedModel

type FittedModel struct {
	Rows  int
	Cols  int
	X     []float64
	Y     []float64
	Coeff []float64
	Names []string
}

FittedModel is a type that holds the design matrix, the target values and the coefficients

type GABackupCB

type GABackupCB struct {
	Cost       string
	Dataset    Dataset
	DataFile   string
	Rate       uint
	BackupFile string
}

GABackupCB is a default type used to construct a default backup function

func (*GABackupCB) Build

func (gab *GABackupCB) Build() func(ga *eaopt.GA)

Build constructs the callback function

type Hook

type Hook struct {
	Script  string
	Capture CaptureFunction
}

Hook is a type that runs the script and captures results from the output using the Capture function

type LinearModel

type LinearModel struct {
	Config  LinearModelConfig
	Include []int
}

LinearModel represents a genome

func (*LinearModel) Clone

func (l *LinearModel) Clone() eaopt.Genome

Clone creates a copy

func (*LinearModel) Crossover

func (l *LinearModel) Crossover(other eaopt.Genome, rng *rand.Rand)

Crossover performs a cross over

func (*LinearModel) Evaluate

func (l *LinearModel) Evaluate() (float64, error)

Evaluate evaluates the fitness

func (*LinearModel) GetCoeff

func (l *LinearModel) GetCoeff() *mat.VecDense

GetCoeff returns the coefficients corresponding to the current selection

func (*LinearModel) IncludedCols

func (l *LinearModel) IncludedCols() []int

IncludedCols returns the indices of the columns that are included according to the 1/0 values in Include (1: included, 0: excluded)

func (*LinearModel) IsEmpty

func (l *LinearModel) IsEmpty() bool

IsEmpty returns true if the model contains no features

func (*LinearModel) IsEqual

func (l *LinearModel) IsEqual(other LinearModel) bool

IsEqual returns true if the two models are equal

func (*LinearModel) Mutate

func (l *LinearModel) Mutate(rng *rand.Rand)

Mutate introduces mutations

func (*LinearModel) MutationRate

func (l *LinearModel) MutationRate() float64

MutationRate returns the mutation rate. If not specified in Config (e.g. 0.0), a default value of 0.5 is used

func (*LinearModel) NumIncluded

func (l *LinearModel) NumIncluded() int

NumIncluded returns the number of included columns

func (*LinearModel) NumSplits

func (l *LinearModel) NumSplits() uint

NumSplits returns the number of splits used in cross over. If not set, 2 is used as default

func (*LinearModel) Optimize

func (l *LinearModel) Optimize() OptimizeResult

Optimize flips all inclusions in. After a call to this function, the included features are affected and set to the best genome

type LinearModelConfig

type LinearModelConfig struct {
	Data         Dataset
	Cost         CostFunction
	MutationRate float64
	NumSplits    uint

	// MaxFeatToDataRatio specifies the maximum value of #feat/#data. If not given,
	// a default value of 0.5 is used
	MaxFeatToDataRatio float64
}

LinearModelConfig contains static configuration for a linear model. It contains meta-information needed to fully define a LinearModel

func (*LinearModelConfig) GetCostFunction

func (lmc *LinearModelConfig) GetCostFunction() CostFunction

GetCostFunction returns the cost function. If not given, AICC is used as default

func (LinearModelConfig) IsEqual

func (lmc LinearModelConfig) IsEqual(other LinearModelConfig) bool

IsEqual if other is equal to lmc, return true. Otherwise, return false.

func (LinearModelConfig) LargestModel

func (lmc LinearModelConfig) LargestModel() int

LargestModel returns the largest model consistent with the feature to data ratio

type LinearModelFactory

type LinearModelFactory struct {
	Config LinearModelConfig

	// Prob is the probability of initially including each feature. If not set,
	// a default value of 0.5 is used. Example: a value of 0.2 will lead to 20%
	// of all features being included in the initial pool
	Prob float64
}

LinearModelFactory produces random models

func (*LinearModelFactory) Generate

func (lmf *LinearModelFactory) Generate(rng *rand.Rand) eaopt.Genome

Generate creates a new random linear model

type Model

type Model struct {
	Datafile   string
	TargetName string
	Coeffs     map[string]float64
	Score      Score
}

Model is a convenience type used to store information about a model

func NewModel

func NewModel(best eaopt.Individual, dataset Dataset, cost string, datafile string) Model

NewModel creates a new fitted model from the best individual of a GA run

func ReadModel

func ReadModel(fname string) (Model, error)

ReadModel reads a model from a JSON file

type ModelIterator

type ModelIterator struct {
	Include []int
	MaxSize int
	// contains filtered or unexported fields
}

ModelIterator iterates through all models by sequentially flipping bits

func (*ModelIterator) Flip

func (m *ModelIterator) Flip()

Flip flips current

func (*ModelIterator) Next

func (m *ModelIterator) Next() []int

Next returns the next model

func (*ModelIterator) UndoLastFlip

func (m *ModelIterator) UndoLastFlip()

UndoLastFlip undoes the previous flip

type OptimizeResult

type OptimizeResult struct {
	Score   float64
	Include []int
	Coeff   *mat.VecDense
}

OptimizeResult is returned by local optimization of the linear model

func OrthogonalMatchingPursuit

func OrthogonalMatchingPursuit(dataset Dataset, cost CostFunction, maxFeatures int) OptimizeResult

OrthogonalMatchingPursuit optimizes the cost function by selecting the model that leads to the largest decrease in the cost function

func (*OptimizeResult) IsEqual

func (or *OptimizeResult) IsEqual(other OptimizeResult) bool

IsEqual returns true if the two optimize results are equal

type Prediction

type Prediction struct {
	Value float64
	Std   float64
}

Prediction is a type that represents a prediction (the expected value and the standard deviation)

func GetPredictions

func GetPredictions(data Dataset, model Model, predData *Dataset) []Prediction

GetPredictions returns the predictions together with the standard deviations for all data in predData. If predData is nil, data will be used (e.g. for in-sample prediction errors)

func ReadPredictions

func ReadPredictions(fname string) ([]Prediction, error)

ReadPredictions reads the predictions from a csv file (same as stored by SavePredictions)

func (Prediction) IsEqual

func (p Prediction) IsEqual(other Prediction) bool

IsEqual returns true if the two predictions are equal

type PredictionErrorFIC

type PredictionErrorFIC struct {
	Data []int
}

PredictionErrorFIC tries to select the model that has the highest precision for a subset of the data

func (*PredictionErrorFIC) Evaluate

func (pef *PredictionErrorFIC) Evaluate(X *mat.Dense, y *mat.VecDense, coeff *mat.VecDense) float64

Evaluate evaluates the focused information criteria

type Score

type Score struct {
	Name  string
	Value float64
}

Score is a convenience type used to collect information about the quality of a model

type SubMatrix

type SubMatrix struct {
	X    mat.Matrix
	Rows int
	Cols int
}

SubMatrix creates a sub-view of a matrix. The view contains the upper left corner starting from element (0, 0) and ending at (Rows, Cols)

func (*SubMatrix) At

func (s *SubMatrix) At(i, j int) float64

At returns the value of element (i, j)

func (*SubMatrix) Dims

func (s *SubMatrix) Dims() (int, int)

Dims returns the dimensions of the matrix

func (*SubMatrix) T

func (s *SubMatrix) T() mat.Matrix

T returns the transpose of the matrix

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL