Documentation ¶
Overview ¶
Package optimizers implements a collection of ML optimizers that can be used by train.Trainer, or by themselves. They all implement optimizers.Interface.
Index ¶
- Constants
- Variables
- func ClipNaNsInGradients(ctx *context.Context, gradients *Node) *Node
- func ClipNaNsInUpdates(ctx *context.Context, original, updates *Node) *Node
- func ClipStepByValue(ctx *context.Context, step *Node) *Node
- func DeleteGlobalStep(ctx *context.Context)
- func GetGlobalStep(ctx *context.Context) int64
- func GetGlobalStepVar(ctx *context.Context) *context.Variable
- func IncrementGlobalStepGraph(ctx *context.Context, g *Graph, dtype dtypes.DType) *Node
- func LearningRateVar(ctx *context.Context, dtype dtypes.DType, initialValue float64) *context.Variable
- func LearningRateVarWithValue(ctx *context.Context, dtype dtypes.DType, value float64) *context.Variable
- func MonotonicProjection(input *Node, margin *Node, axis int) *Node
- func TraceNaNInGradients(ctx *context.Context, variable *context.Variable, gradients *Node)
- type AdamConfig
- func (c *AdamConfig) Adamax() *AdamConfig
- func (c *AdamConfig) Betas(beta1, beta2 float64) *AdamConfig
- func (c *AdamConfig) DType(dtype dtypes.DType) *AdamConfig
- func (c *AdamConfig) Done() Interface
- func (c *AdamConfig) Epsilon(epsilon float64) *AdamConfig
- func (c *AdamConfig) FromContext(ctx *context.Context) *AdamConfig
- func (c *AdamConfig) LearningRate(value float64) *AdamConfig
- func (c *AdamConfig) Scope(name string) *AdamConfig
- func (c *AdamConfig) WeightDecay(weightDecay float64) *AdamConfig
- func (c *AdamConfig) WithBackoffSteps(numSteps int) *AdamConfig
- type Interface
- type SGDConfig
- func (sgd *SGDConfig) Clear(_ *context.Context)
- func (sgd *SGDConfig) Done() Interface
- func (sgd *SGDConfig) UpdateGraph(ctx *context.Context, g *Graph, loss *Node)
- func (sgd *SGDConfig) UpdateGraphWithGradients(ctx *context.Context, grads []*Node, lossDType dtypes.DType)
- func (sgd *SGDConfig) WithDecay(enabled bool) *SGDConfig
- func (sgd *SGDConfig) WithLearningRate(initialLearningRate float64) *SGDConfig
- type Tracer
Constants ¶
const (
	// AdamDefaultLearningRate is used by Adam if no learning rate is set.
	AdamDefaultLearningRate = 0.001

	// AdamDefaultScope is the default scope name for moments and step used by Adam.
	AdamDefaultScope = "AdamOptimizer"

	// ParamAdamEpsilon can be used to configure the default value of epsilon. It must be a float64.
	ParamAdamEpsilon = "adam_epsilon"

	// ParamAdamDType can be used to specify the dtype to be used by Adam's temporary variables and computations.
	// The default, or if set to empty, is to use the same dtype as the value of the loss provided.
	// This was created for the case of training with `float16` or `bfloat16`, which don't have enough resolution
	// for Adam calculations.
	// Valid values: "" (empty), "float32", "float64".
	ParamAdamDType = "adam_dtype"

	// ParamAdamWeightDecay defaults to 0.0. See AdamConfig.WeightDecay.
	ParamAdamWeightDecay = "adam_weight_decay"

	// ParamAdamBeta1 is the moving average coefficient for the gradient (momentum), the numerator.
	// The default value is 0.9.
	ParamAdamBeta1 = "adam_beta1"

	// ParamAdamBeta2 is the moving average coefficient for the variance, the denominator.
	// The default value is 0.999.
	ParamAdamBeta2 = "adam_beta2"

	// ParamAdamBackoffSteps defaults to 0. Values > 0 prevent any gradient steps from being taken
	// for that many steps, to allow a better estimate of the momentum and variance.
	// See AdamConfig.WithBackoffSteps.
	ParamAdamBackoffSteps = "adam_backoff"
)
const (
	// GlobalStepVariableName as stored in context.Context, usually in the root scope -- but it depends on the caller.
	GlobalStepVariableName = "global_step"

	// Scope reserved for optimizers.
	Scope = "optimizers"
)
const SGDDefaultLearningRate = 0.1
SGDDefaultLearningRate is the default learning rate used by the StochasticGradientDescent optimizer.
Variables ¶
var (
	// KnownOptimizers is a map of known optimizers by name to their default constructors.
	// This provides an easy quick-start point. One can hyperparameter-tune the optimizers
	// for usually slightly better results.
	KnownOptimizers = map[string]func(ctx *context.Context) Interface{
		"sgd": func(ctx *context.Context) Interface {
			return StochasticGradientDescent()
		},
		"adam": func(ctx *context.Context) Interface {
			return Adam().FromContext(ctx).Done()
		},
		"adamax": func(ctx *context.Context) Interface {
			return Adam().Adamax().FromContext(ctx).Done()
		},
		"adamw": func(ctx *context.Context) Interface {
			return Adam().WeightDecay(0.004).FromContext(ctx).Done()
		},
		"rmsprop": func(ctx *context.Context) Interface {
			return RMSProp().FromContext(ctx).Done()
		},
	}

	// ParamOptimizer is the context parameter with the name of the optimizer.
	// The default value is "adamw", and the valid values are "sgd", "adam", "adamw" and "adamax".
	ParamOptimizer = "optimizer"

	// ParamLearningRate is the context parameter name for the default value of the learning rate.
	// It is used by most (all?) optimizers.
	ParamLearningRate = "learning_rate"

	// LearningRateKey is an alias to ParamLearningRate.
	//
	// Deprecated: use ParamLearningRate instead.
	LearningRateKey = ParamLearningRate

	// ParamClipStepByValue is a clip scalar value for each individual value of the gradient step, after
	// being scaled by the learning rate and optimizer.
	// The step applied will be `ClipScalar(step, -clip_step_by_value, +clip_step_by_value)`.
	// Defaults to no clipping, and values are expected to be float64.
	ParamClipStepByValue = "clip_step_by_value"

	// ParamClipNaN will drop any updates with NaNs.
	// This is a double-edged option: it keeps training running, but it will probably replace the NaNs with bad training results.
	// It works well to handle spurious results.
	//
	// See also ParamNanLogger to help debug it.
	//
	// The default is false.
	ParamClipNaN = "clip_nan"

	// ParamNanLogger configures a nanlogger to use to report NaNs in gradient updates, for example. See TraceNaNInGradients.
	// This value is not saved in a checkpoint.
	// It should be set to a Tracer (which a *nanlogger.NanLogger is).
	//
	// Typical use:
	//
	//	var nanLogger *nanlogger.NanLogger
	//	if debugNaNs {
	//		nanLogger = nanlogger.New()
	//		ctx.SetParam(optimizers.ParamNanLogger, nanLogger)
	//	}
	//	trainer := train.NewTrainer(…)
	//	nanLogger.AttachToTrainer(trainer)
	ParamNanLogger = "nanlogger"
)
Functions ¶
func ClipNaNsInGradients ¶ added in v0.17.0
ClipNaNsInGradients will replace the gradient tensor with zeros if it contains any NaN or +/-Inf values. It is only enabled if ParamClipNaN is set to true.
See also ClipNaNsInUpdates.
func ClipNaNsInUpdates ¶ added in v0.13.0
ClipNaNsInUpdates replaces values in updates with the corresponding original values wherever updates contains NaN (or +/-Inf) values, if ParamClipNaN is set to true.
See also ClipNaNsInGradients.
func ClipStepByValue ¶ added in v0.10.0
ClipStepByValue applies the ParamClipStepByValue hyperparameter if it is not 0.0 (the default).
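Both clips are driven purely by context hyperparameters. Below is a minimal sketch of enabling them on a context before building the trainer; ctx.SetParam follows the usage shown under ParamNanLogger above, the 0.1 clip value is an arbitrary example, and the import path assumes the usual GoMLX layout.

```
import (
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
)

// enableGradientClipping is a sketch: it turns on step-value clipping and NaN
// dropping via context hyperparameters; the optimizers apply ClipStepByValue
// and the NaN-clipping helpers internally when these are set.
func enableGradientClipping(ctx *context.Context) {
	ctx.SetParam(optimizers.ParamClipStepByValue, 0.1) // Clip each step value to [-0.1, +0.1] (arbitrary example).
	ctx.SetParam(optimizers.ParamClipNaN, true)        // Drop updates containing NaN or +/-Inf.
}
```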
func DeleteGlobalStep ¶ added in v0.8.0
DeleteGlobalStep deletes the global step variable, in case one wants to reset the model state or hide how many steps were taken.
func GetGlobalStep ¶ added in v0.8.0
GetGlobalStep returns the current global step value. It creates the global step variable if it does not yet exist.
func GetGlobalStepVar ¶ added in v0.4.0
GetGlobalStepVar returns the global step counter, a dtypes.Int64 variable. It creates it (initialized with 0) if not already there. This can be used in graph building or directly.
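A small sketch of how these helpers might be used outside of graph building, for example to report and then reset the step counter (import paths assume the usual GoMLX layout):

```
import (
	"fmt"

	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
)

// reportAndResetGlobalStep is a sketch: it reads the current global step
// (creating the variable if needed), prints it, and then deletes it to reset
// the step counting for a fresh training run.
func reportAndResetGlobalStep(ctx *context.Context) {
	step := optimizers.GetGlobalStep(ctx) // int64; creates the variable if it doesn't exist yet.
	fmt.Printf("trained for %d steps\n", step)
	optimizers.DeleteGlobalStep(ctx)
}
```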
func IncrementGlobalStepGraph ¶
IncrementGlobalStepGraph creates (if not there yet) a global step counter, and returns it incremented -- its first returned value will be 1.
It only builds the computation graph, no actual values are generated.
Typically, this is called by the optimizers UpdateGraph method.
GlobalStep is always stored as dtypes.Int64, but it is converted to the given DType before being returned.
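A minimal sketch of calling it from a graph-building function; the requested dtype only affects the returned node, since the variable itself stays Int64 (import paths are assumed to follow the usual GoMLX layout):

```
import (
	"github.com/gomlx/gomlx/graph"
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
	"github.com/gomlx/gopjrt/dtypes"
)

// globalStepNode is a sketch of a graph-building helper: it increments the
// global step and returns it as a float32 node -- the first returned value is 1.
// Typically this call is made by the optimizer's UpdateGraph.
func globalStepNode(ctx *context.Context, g *graph.Graph) *graph.Node {
	return optimizers.IncrementGlobalStepGraph(ctx, g, dtypes.Float32)
}
```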
func LearningRateVar ¶
func LearningRateVar(ctx *context.Context, dtype dtypes.DType, initialValue float64) *context.Variable
LearningRateVar returns the learning rate variable -- a scalar value of the given dtype.
If the variable doesn't exist yet, it is initialized with initialValue.
Consider reading the initialValue from context.GetParamOr(ctx, ParamLearningRate, SGDDefaultLearningRate).
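A sketch following that suggestion, assuming float32 and the usual GoMLX import paths:

```
import (
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
	"github.com/gomlx/gopjrt/dtypes"
)

// learningRateVar is a sketch: it reads the initial learning rate from the
// context hyperparameters (falling back to SGDDefaultLearningRate) and then
// creates or reuses the learning-rate variable.
func learningRateVar(ctx *context.Context) *context.Variable {
	initial := context.GetParamOr(ctx, optimizers.ParamLearningRate, optimizers.SGDDefaultLearningRate)
	return optimizers.LearningRateVar(ctx, dtypes.Float32, initial)
}
```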
func LearningRateVarWithValue ¶
func LearningRateVarWithValue(ctx *context.Context, dtype dtypes.DType, value float64) *context.Variable
LearningRateVarWithValue creates (or reuses) variable for learning rate with the given value.
func MonotonicProjection ¶ added in v0.13.0
func MonotonicProjection(input *Node, margin *Node, axis int) *Node
MonotonicProjection transforms the input into a monotonic sequence on the given axis that respects the minimum margin between consecutive points.
Here we call a solution "viable" if it respects the given margin between consecutive points. The goal is to find the viable solution that is L2-closest to the original input -- we don't achieve that, but an approximation that is hopefully good enough for most algorithms.
This is not a trivial problem, as adjustments to one point may break the monotonicity of the next, and so on. A close-to-optimal approximate solution can be achieved using Lagrange multipliers (and Dykstra's alternating projections); see the implementation in TensorFlow Lattice: https://github.com/tensorflow/lattice/blob/master/tensorflow_lattice/python/pwl_calibration_lib.py#L472
Unfortunately, GoMLX doesn't support "while" loops in the computation graph yet, so instead we make a coarse but simple projection to the viable space using a simple algorithm -- see code.
The usual way to use this is inside a call to train.AddPerStepUpdateGraphFn, making the projection happen after the gradient step.
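A hedged sketch of such a per-step projection body; it assumes Variable.ValueGraph/SetValueGraph for in-graph reads and writes (SetValueGraph is mentioned under Interface below), a float32 variable, and an arbitrary margin of 1e-3:

```
import (
	"github.com/gomlx/gomlx/graph"
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
)

// projectMonotonic is a sketch of the body of a per-step update: it re-projects
// the given (assumed float32) variable onto the monotonic space along axis 0,
// keeping a minimum gap of 1e-3 between consecutive points.
func projectMonotonic(g *graph.Graph, v *context.Variable) {
	margin := graph.Const(g, float32(1e-3))
	projected := optimizers.MonotonicProjection(v.ValueGraph(g), margin, 0)
	v.SetValueGraph(projected)
}
```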
func TraceNaNInGradients ¶ added in v0.21.0
TraceNaNInGradients will report a NaN/Inf value in the gradient for the given variable, if a "Tracer" (typically a nanlogger.NanLogger) has been configured in the context.
Types ¶
type AdamConfig ¶
type AdamConfig struct {
// contains filtered or unexported fields
}
AdamConfig holds the configuration for an Adam optimizer. Create it using Adam(), and once configured call Done to create an Adam-based optimizers.Interface.
func Adam ¶
func Adam() *AdamConfig
Adam optimization is a stochastic gradient descent method based on an adaptive estimation of first-order and second-order moments. According to [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), the method is "*computationally efficient, has little memory requirement, invariant to diagonal rescaling of gradients, and is well suited for problems that are large in terms of data/parameters*".
It returns a configuration object that can be used to set its parameters. Once configured, call AdamConfig.Done, and it will return an optimizers.Interface that can be used with the `train.Trainer` or directly in a custom optimization loop.
See AdamConfig.FromContext to configure it from the context hyperparameters.
Clipping of the gradient updates is available by setting the context hyperparameters ParamClipStepByValue ("clip_step_by_value") and ParamClipNaN ("clip_nan"). NaNs in gradients can be reported by assigning a `nanlogger.NanLogger` to the parameter ParamNanLogger.
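A minimal sketch of configuring it explicitly through the builder, instead of reading hyperparameters from the context with FromContext (all values are arbitrary examples; the import path is assumed):

```
import "github.com/gomlx/gomlx/ml/train/optimizers"

// adamW is a sketch of an explicitly configured Adam-based optimizer.
var adamW = optimizers.Adam().
	LearningRate(1e-3). // Base learning rate.
	Betas(0.9, 0.999).  // Moving-average constants for momentum and variance.
	Epsilon(1e-7).      // Small constant in the denominator for stability.
	WeightDecay(0.004). // Static weight decay, making it AdamW.
	Done()              // Returns an optimizers.Interface, e.g. for train.NewTrainer.
```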
func RMSProp ¶ added in v0.21.0
func RMSProp() *AdamConfig
RMSProp is an optimizer that divides the learning rate for a weight by a running average of the recent gradients magnitudes (L2) for that weight.
It uses Adam to implement it -- it's somewhat equivalent to an Adam without the 1st moment of the gradients.
It was described first in the following sources:
- https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf (Hinton)
- https://arxiv.org/pdf/1308.0850 (Graves)
It returns a configuration object that can be used to set its parameters. Once configured, call AdamConfig.Done, and it will return an optimizers.Interface that can be used with the `train.Trainer` or directly in a custom optimization loop.
Clipping of the gradient updates is available by setting the context hyperparameters ParamClipStepByValue ("clip_step_by_value") and ParamClipNaN ("clip_nan"). NaNs in gradients can be reported by assigning a `nanlogger.NanLogger` to the parameter ParamNanLogger.
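A short sketch; since RMSProp is implemented on top of Adam, it reuses the same AdamConfig builder methods (the value here is an arbitrary example, import path assumed):

```
import "github.com/gomlx/gomlx/ml/train/optimizers"

// rmsProp is a sketch of an RMSProp optimizer built through the shared AdamConfig builder.
var rmsProp = optimizers.RMSProp().
	LearningRate(1e-3).
	Done()
```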
func (*AdamConfig) Adamax ¶
func (c *AdamConfig) Adamax() *AdamConfig
Adamax configures Adam to use the L-infinity norm (== max, which gives the name) for the second moment, instead of L2, as described in the same Adam paper.
func (*AdamConfig) Betas ¶
func (c *AdamConfig) Betas(beta1, beta2 float64) *AdamConfig
Betas sets the two moving-average constants (exponential decays). They default to 0.9 and 0.999.
The first is for the gradient momentum (the numerator of the step taken), and the second is for the variance of the gradients (denominator).
func (*AdamConfig) DType ¶ added in v0.10.0
func (c *AdamConfig) DType(dtype dtypes.DType) *AdamConfig
DType sets the dtype to use for Adam calculation and temporary variables. This can be useful if training using `float16`, which is not enough resolution for Adam calculations in some cases.
If set to `shapes.InvalidDType` it will use the dtype of the `loss` used to optimize.
This can also be set from context using ParamAdamDType("adam_dtype") hyperparameter.
func (*AdamConfig) Done ¶
func (c *AdamConfig) Done() Interface
Done finishes the configuration and constructs an optimizers.Interface that implements Adam to specification.
func (*AdamConfig) Epsilon ¶
func (c *AdamConfig) Epsilon(epsilon float64) *AdamConfig
Epsilon sets the small constant added to the denominator for numerical stability. For low-precision dtypes like float16, try a larger value here, like 1e-3.
func (*AdamConfig) FromContext ¶ added in v0.10.0
func (c *AdamConfig) FromContext(ctx *context.Context) *AdamConfig
FromContext will configure Adam with hyperparameters set in the given context. E.g.: "adam_epsilon" (see ParamAdamEpsilon) is used to set AdamConfig.Epsilon.
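A sketch of driving the configuration through context hyperparameters instead of builder calls; the parameter values are arbitrary examples, and import paths assume the usual GoMLX layout:

```
import (
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
)

// adamFromContext is a sketch: the parameters set here map to the builder
// methods (Epsilon, DType, ...), and FromContext picks them up.
func adamFromContext(ctx *context.Context) optimizers.Interface {
	ctx.SetParam(optimizers.ParamAdamEpsilon, 1e-3)    // E.g., a larger epsilon for low-precision training.
	ctx.SetParam(optimizers.ParamAdamDType, "float32") // Keep Adam's moments and computations in float32.
	return optimizers.Adam().FromContext(ctx).Done()
}
```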
func (*AdamConfig) LearningRate ¶
func (c *AdamConfig) LearningRate(value float64) *AdamConfig
LearningRate sets the base learning rate as a floating point value -- eventually converted to the same dtype as the loss.
Default is either the value of ParamLearningRate ("learning_rate") global parameter in Context if defined, or 0.001 if not.
func (*AdamConfig) Scope ¶
func (c *AdamConfig) Scope(name string) *AdamConfig
Scope defines the top-level scope used to store the 1st and 2nd order moments of the gradients and the step number used by the Adam optimizer. Generally this doesn't need to be changed, but if one is using multiple training schedules, potentially with different loss functions (so the moments should be different), one can change it.
It defaults to AdamDefaultScope.
func (*AdamConfig) WeightDecay ¶
func (c *AdamConfig) WeightDecay(weightDecay float64) *AdamConfig
WeightDecay configures the optimizer to work as AdamW, with the given static weight decay. This is because L2 regularization doesn't work well with Adam.
Defaults to the value given by the ParamAdamWeightDecay ("adam_weight_decay") hyperparameter.
TODO: (1) Allow certain variables to be excluded from weight decay (e.g: biases); (2) Allow dynamically calculated weight decay.
func (*AdamConfig) WithBackoffSteps ¶ added in v0.21.1
func (c *AdamConfig) WithBackoffSteps(numSteps int) *AdamConfig
WithBackoffSteps prevents any gradient steps from being taken until numSteps steps have passed, to allow for a better estimate of the gradient momentum (numerator) and the variance of the gradients (denominator) before the optimization starts.
If set to <= 0, no backoff is configured.
The default is 0, or the value set with the ParamAdamBackoffSteps ("adam_backoff") hyperparameter.
type Interface ¶
type Interface interface {
// UpdateGraph is the function called during computation graph building, it
// calculates the updates to the variables (weights) of the model needed for one
// training step.
// It should return these updates.
//
// Variable values can be updated in graph building time (inside UpdateGraph) using Variable.SetValueGraph,
// and the trainer (train.Trainer) will make sure these values are returned from the graph execution
// and the materialized values used to update the variables (Variable.SetValue).
//
// The ctx holds the variables to train (marked as trainable), the hyperparameters
// used by the optimizer (in `ctx.Params`) and non-trainable variables
// that the optimizer itself may create. One should scope it (context.Context.In("<some scope name>"))
// to avoid naming conflicts on the variables created -- notice that
// some complex training schedule may have more than one optimizer on the same Context object.
//
// loss must be a scalar value.
UpdateGraph(ctx *context.Context, g *Graph, loss *Node)
// Clear deletes all temporary variables used by the optimizer.
// This may be used for a model to be used by inference to save space, or if the training should be reset
// for some other reason.
Clear(ctx *context.Context)
}
Interface implemented by optimizer implementations.
Optionally, an optimizer may also implement the interface trainer.OptimizerWithGradients to allow for updates from accumulated gradients.
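A hypothetical skeleton (not part of the package) showing the shape of a custom implementation; it only advances the global step and leaves the variables untouched, with comments indicating where a real optimizer would apply updates (import paths assumed):

```
import (
	"github.com/gomlx/gomlx/graph"
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
	"github.com/gomlx/gopjrt/dtypes"
)

// noOpOptimizer is a hypothetical skeleton of a custom optimizer.
type noOpOptimizer struct{}

func (o *noOpOptimizer) UpdateGraph(ctx *context.Context, g *graph.Graph, loss *graph.Node) {
	// A real optimizer would compute the gradients of loss w.r.t. the trainable
	// variables here and apply the scaled updates with Variable.SetValueGraph.
	_ = optimizers.IncrementGlobalStepGraph(ctx, g, dtypes.Int64)
}

func (o *noOpOptimizer) Clear(ctx *context.Context) {
	// No temporary variables were created, so there is nothing to clear.
}

// Compile-time check that the skeleton satisfies optimizers.Interface.
var _ optimizers.Interface = (*noOpOptimizer)(nil)
```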
func ByName ¶ added in v0.11.0
ByName returns an optimizer given its name, or panics if it does not exist. It uses KnownOptimizers -- check that map directly in case one wants to better handle invalid values.
Some optimizers (e.g., Adam) use optional hyperparameters set in the context for configuration.
See also FromContext.
Example usage:
```
var flagOptimizer = flag.String("optimizer", "adamw",
	fmt.Sprintf("Optimizer, options: %q", maps.Keys(optimizers.KnownOptimizers)))

...

trainer := train.NewTrainer(manager, ctx, ModelGraph,
	losses.SomeLoss,
	optimizers.ByName(ctx, *flagOptimizer),
	[]metrics.Interface{someMetric}, // trainMetrics
	[]metrics.Interface{otherMetric}) // evalMetrics
```
func FromContext ¶ added in v0.9.0
FromContext creates an optimizer from context hyperparameters. See ParamOptimizer. The default is "adamw".
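A sketch assuming FromContext takes the context and returns the configured Interface (its exact signature isn't shown here); the parameter values are arbitrary examples:

```
import (
	"github.com/gomlx/gomlx/ml/context"
	"github.com/gomlx/gomlx/ml/train/optimizers"
)

// optimizerFromContext is a sketch: select the optimizer (and its tuning
// parameters) through hyperparameters, then let FromContext build it.
func optimizerFromContext(ctx *context.Context) optimizers.Interface {
	ctx.SetParam(optimizers.ParamOptimizer, "adam")
	ctx.SetParam(optimizers.ParamLearningRate, 3e-4) // Used by most optimizers.
	return optimizers.FromContext(ctx)
}
```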
type SGDConfig ¶ added in v0.20.1
type SGDConfig struct {
// contains filtered or unexported fields
}
SGDConfig implements a Stochastic Gradient Descent optimizer.
func StochasticGradientDescent ¶
func StochasticGradientDescent() *SGDConfig
StochasticGradientDescent creates an optimizer that performs SGD. It looks for "learning_rate" in Context.Params for the initial learning rate, otherwise it defaults to SGDDefaultLearningRate.
By default, it has a learning rate decay given by: `learning_rate = initial_learning_rate / Sqrt(global_step)`
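A minimal sketch of configuring it explicitly (e.g. for a test), rather than through context hyperparameters (import path assumed):

```
import "github.com/gomlx/gomlx/ml/train/optimizers"

// sgd is a sketch of a plain SGD optimizer with an explicit learning rate and
// the 1/sqrt(global_step) decay disabled.
var sgd = optimizers.StochasticGradientDescent().
	WithLearningRate(0.01).
	WithDecay(false).
	Done()
```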
func (*SGDConfig) Clear ¶ added in v0.20.1
Clear deletes all optimizer variables. There are none for SGD, so this is a no-op. It implements optimizers.Interface.
func (*SGDConfig) Done ¶ added in v0.20.1
Done returns an optimizers.Interface. It's a no-op since SGDConfig itself implements optimizers.Interface, but it keeps the builder pattern consistent, and the returned Interface is no longer configurable.
func (*SGDConfig) UpdateGraph ¶ added in v0.20.1
UpdateGraph builds the graph to update the weights for one training step. It implements optimizers.Interface.
func (*SGDConfig) UpdateGraphWithGradients ¶ added in v0.20.1
func (*SGDConfig) WithDecay ¶ added in v0.20.1
WithDecay sets whether to use a learning rate decay with the global step.
It is enabled by default, but tests may want to disable it.
It returns itself to allow chaining.
func (*SGDConfig) WithLearningRate ¶ added in v0.20.1
WithLearningRate sets the initial learning rate. The default value is SGDDefaultLearningRate.
It returns itself to allow chaining.
Directories ¶

| Path | Synopsis |
|---|---|
| cosineschedule | Package cosineschedule implements a cosine annealing schedule for the learning rate. |