Documentation ¶
Index ¶
- Constants
- Variables
- func IgnoredForTopology(p *corev1.Pod) bool
- func InstanceTypeList(instanceTypeOptions []*cloudprovider.InstanceType) string
- func IsReservedOfferingError(err error) bool
- func NoCompatibleInstanceTypes(np *v1.NodePool) events.Event
- func NominatePodEvent(pod *corev1.Pod, node *corev1.Node, nodeClaim *v1.NodeClaim) events.Event
- func PodFailedToScheduleEvent(pod *corev1.Pod, err error) events.Event
- func TopologyListOptions(namespace string, labelSelector *metav1.LabelSelector) *client.ListOptions
- type ExistingNode
- type InstanceTypeFilterError
- type NodeClaim
- type NodeClaimTemplate
- type Options
- type PodData
- type Preferences
- type Queue
- type ReservationManager
- type ReservedOfferingError
- type ReservedOfferingMode
- type Results
- func (r Results) AllNonPendingPodsScheduled() bool
- func (r Results) NonPendingPodSchedulingErrors() string
- func (r Results) Record(ctx context.Context, recorder events.Recorder, cluster *state.Cluster)
- func (r Results) ReservedOfferingErrors() map[*corev1.Pod]error
- func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results
- type Scheduler
- type Topology
- func (t *Topology) AddRequirements(p *corev1.Pod, taints []corev1.Taint, ...) (scheduling.Requirements, error)
- func (t *Topology) Record(p *corev1.Pod, taints []corev1.Taint, requirements scheduling.Requirements, ...)
- func (t *Topology) Register(topologyKey string, domain string)
- func (t *Topology) Unregister(topologyKey string, domain string)
- func (t *Topology) Update(ctx context.Context, p *corev1.Pod) error
- type TopologyDomainGroup
- type TopologyGroup
- func (t *TopologyGroup) AddOwner(key types.UID)
- func (t *TopologyGroup) Counts(pod *corev1.Pod, taints []corev1.Taint, requirements scheduling.Requirements, ...) bool
- func (t *TopologyGroup) Get(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement
- func (t *TopologyGroup) Hash() uint64
- func (t *TopologyGroup) IsOwnedBy(key types.UID) bool
- func (t *TopologyGroup) Record(domains ...string)
- func (t *TopologyGroup) Register(domains ...string)
- func (t *TopologyGroup) RemoveOwner(key types.UID)
- func (t *TopologyGroup) Unregister(domains ...string)
- type TopologyNodeFilter
- type TopologyType
- type VolumeTopology
Constants ¶
const (
ControllerLabel = "controller"
)
Variables ¶
var (
    DurationSeconds = opmetrics.NewPrometheusHistogram(
        crmetrics.Registry,
        prometheus.HistogramOpts{
            Namespace: metrics.Namespace,
            Subsystem: schedulerSubsystem,
            Name:      "scheduling_duration_seconds",
            Help:      "Duration of scheduling simulations used for deprovisioning and provisioning in seconds.",
            Buckets:   metrics.DurationBuckets(),
        },
        []string{
            ControllerLabel,
        },
    )
    QueueDepth = opmetrics.NewPrometheusGauge(
        crmetrics.Registry,
        prometheus.GaugeOpts{
            Namespace: metrics.Namespace,
            Subsystem: schedulerSubsystem,
            Name:      "queue_depth",
            Help:      "The number of pods currently waiting to be scheduled.",
        },
        []string{
            ControllerLabel,
            schedulingIDLabel,
        },
    )
    UnfinishedWorkSeconds = opmetrics.NewPrometheusGauge(
        crmetrics.Registry,
        prometheus.GaugeOpts{
            Namespace: metrics.Namespace,
            Subsystem: schedulerSubsystem,
            Name:      "unfinished_work_seconds",
            Help:      "How many seconds of work has been done that is in progress and hasn't been observed by scheduling_duration_seconds.",
        },
        []string{
            ControllerLabel,
            schedulingIDLabel,
        },
    )
    IgnoredPodCount = opmetrics.NewPrometheusGauge(
        crmetrics.Registry,
        prometheus.GaugeOpts{
            Namespace: metrics.Namespace,
            Subsystem: schedulerSubsystem,
            Name:      "ignored_pods_count",
            Help:      "Number of pods ignored during scheduling by Karpenter",
        },
        []string{},
    )
    UnschedulablePodsCount = opmetrics.NewPrometheusGauge(
        crmetrics.Registry,
        prometheus.GaugeOpts{
            Namespace: metrics.Namespace,
            Subsystem: schedulerSubsystem,
            Name:      "unschedulable_pods_count",
            Help:      "The number of unschedulable Pods.",
        },
        []string{
            ControllerLabel,
        },
    )
)
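As a usage illustration only (not part of the package): the sketch below shows how these metrics are typically driven from a scheduling run. It assumes the operatorpkg metric wrappers returned by NewPrometheusHistogram and NewPrometheusGauge expose Observe and Set methods taking a value plus a label map; the observeSchedulingRun helper itself is hypothetical.

package scheduling

import "time"

// observeSchedulingRun is an illustrative, in-package sketch: time a scheduling
// simulation and report the current queue depth using the metrics defined above.
func observeSchedulingRun(controllerName, schedulingID string, pendingPods int, simulate func()) {
    start := time.Now()
    simulate() // run the scheduling simulation

    // Duration of the completed simulation, labeled by controller.
    DurationSeconds.Observe(time.Since(start).Seconds(), map[string]string{
        ControllerLabel: controllerName,
    })
    // Pods still waiting to be scheduled, labeled by controller and scheduling ID.
    QueueDepth.Set(float64(pendingPods), map[string]string{
        ControllerLabel:   controllerName,
        schedulingIDLabel: schedulingID,
    })
}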
var DisableReservedCapacityFallback = func(opts *options) { opts.reservedOfferingMode = ReservedOfferingModeStrict }
var MaxInstanceTypes = 60
MaxInstanceTypes restricts the number of instance types to be sent for launch. It is intentionally declared as a var rather than a const so that tests can override it.
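Because MaxInstanceTypes is a var, a test can shrink it to exercise truncation cheaply. A minimal sketch; the import path and the test body are assumptions:

package scheduling_test

import (
    "testing"

    "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling" // assumed import path
)

// TestTruncationWithSmallLimit lowers MaxInstanceTypes for the duration of a
// test so truncation can be exercised without generating 60+ instance types.
func TestTruncationWithSmallLimit(t *testing.T) {
    original := scheduling.MaxInstanceTypes
    scheduling.MaxInstanceTypes = 5
    t.Cleanup(func() { scheduling.MaxInstanceTypes = original })

    // ... build a Results value and call Results.TruncateInstanceTypes(scheduling.MaxInstanceTypes) ...
}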
var PodNominationRateLimiter = flowcontrol.NewTokenBucketRateLimiter(5, 10)
PodNominationRateLimiter is a pointer so it rate-limits across events
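To see why sharing one limiter pointer matters, here is a standalone sketch using the same client-go token bucket (5 tokens per second, burst of 10). It is not the package's event plumbing, just an illustration of the shared-budget behavior; maybeNominate is hypothetical.

package main

import (
    "fmt"

    "k8s.io/client-go/util/flowcontrol"
)

// One shared limiter: because every caller holds the same pointer, the token
// budget is shared across all nomination events rather than per call site.
var podNominationRateLimiter = flowcontrol.NewTokenBucketRateLimiter(5, 10)

func maybeNominate(podName string) {
    // TryAccept consumes a token if one is available and never blocks.
    if !podNominationRateLimiter.TryAccept() {
        return // over budget; drop this nomination event
    }
    fmt.Printf("nominating pod %s\n", podName)
}

func main() {
    for i := 0; i < 20; i++ {
        maybeNominate(fmt.Sprintf("pod-%d", i)) // only roughly the burst (10) pass immediately
    }
}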
Functions ¶
func IgnoredForTopology ¶
func IgnoredForTopology(p *corev1.Pod) bool
func InstanceTypeList ¶
func InstanceTypeList(instanceTypeOptions []*cloudprovider.InstanceType) string
func IsReservedOfferingError ¶ added in v1.2.1
func IsReservedOfferingError(err error) bool
func NoCompatibleInstanceTypes ¶ added in v1.2.0
func NoCompatibleInstanceTypes(np *v1.NodePool) events.Event
func NominatePodEvent ¶
func NominatePodEvent(pod *corev1.Pod, node *corev1.Node, nodeClaim *v1.NodeClaim) events.Event
func PodFailedToScheduleEvent ¶
func PodFailedToScheduleEvent(pod *corev1.Pod, err error) events.Event
func TopologyListOptions ¶
func TopologyListOptions(namespace string, labelSelector *metav1.LabelSelector) *client.ListOptions
Types ¶
type ExistingNode ¶
type ExistingNode struct {
    *state.StateNode
    Pods []*v1.Pod
    // contains filtered or unexported fields
}
func NewExistingNode ¶
func NewExistingNode(n *state.StateNode, topology *Topology, taints []v1.Taint, daemonResources v1.ResourceList) *ExistingNode
type InstanceTypeFilterError ¶ added in v1.2.1
type InstanceTypeFilterError struct {
// contains filtered or unexported fields
}
func (InstanceTypeFilterError) Error ¶ added in v1.2.1
func (e InstanceTypeFilterError) Error() string
type NodeClaim ¶
type NodeClaim struct {
    NodeClaimTemplate
    Pods []*corev1.Pod
    // contains filtered or unexported fields
}
NodeClaim is a set of constraints, compatible pods, and possible instance types that could fulfill these constraints. This will be turned into one or more actual node instances within the cluster after bin packing.
func NewNodeClaim ¶
func NewNodeClaim(
    nodeClaimTemplate *NodeClaimTemplate,
    topology *Topology,
    daemonResources corev1.ResourceList,
    instanceTypes []*cloudprovider.InstanceType,
    reservationManager *ReservationManager,
    reservedOfferingMode ReservedOfferingMode,
) *NodeClaim
func (*NodeClaim) FinalizeScheduling ¶
func (n *NodeClaim) FinalizeScheduling()
FinalizeScheduling is called once all scheduling has completed and allows the node to perform any cleanup necessary before its requirements are used for instance launching
func (*NodeClaim) RemoveInstanceTypeOptionsByPriceAndMinValues ¶ added in v1.0.0
func (n *NodeClaim) RemoveInstanceTypeOptionsByPriceAndMinValues(reqs scheduling.Requirements, maxPrice float64) (*NodeClaim, error)
type NodeClaimTemplate ¶
type NodeClaimTemplate struct {
    v1.NodeClaim
    NodePoolName        string
    NodePoolUUID        types.UID
    NodePoolWeight      int32
    InstanceTypeOptions cloudprovider.InstanceTypes
    Requirements        scheduling.Requirements
}
NodeClaimTemplate encapsulates the fields required to create a node and mirrors the fields in NodePool. These structs are maintained separately in order for fields like Requirements to be able to be stored more efficiently.
func NewNodeClaimTemplate ¶
func NewNodeClaimTemplate(nodePool *v1.NodePool) *NodeClaimTemplate
func (*NodeClaimTemplate) ToNodeClaim ¶
func (i *NodeClaimTemplate) ToNodeClaim() *v1.NodeClaim
type PodData ¶ added in v1.2.1
type PodData struct {
    Requests           corev1.ResourceList
    Requirements       scheduling.Requirements
    StrictRequirements scheduling.Requirements
}
type Preferences ¶
type Preferences struct {
    // ToleratePreferNoSchedule controls if preference relaxation adds a toleration for PreferNoSchedule taints. This only
    // helps if there is a corresponding taint, so we don't always add it.
    ToleratePreferNoSchedule bool
}
type Queue ¶
type Queue struct {
// contains filtered or unexported fields
}
Queue is a queue of pods awaiting scheduling. It is used to keep attempting to schedule pods as long as progress is being made. This is sometimes required to maintain zonal topology spreads with constrained pods, and it can satisfy pod affinities that occur within a batch of pods if enough constraints are provided.
func NewQueue ¶
NewQueue constructs a new queue given the input pods, sorting them to optimize for bin-packing into nodes.
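The "retry while making progress" idea behind Queue can be illustrated with a plain slice. This sketch is not the package's Queue implementation (whose internals are unexported), only the pattern its documentation describes:

package main

import "fmt"

// scheduleWhileProgress keeps retrying the pods that failed as long as at least
// one pod scheduled during the previous pass, mirroring the Queue's behavior of
// continuing only while progress is being made.
func scheduleWhileProgress(pods []string, trySchedule func(pod string) bool) (failed []string) {
    remaining := pods
    for {
        var next []string
        progress := false
        for _, p := range remaining {
            if trySchedule(p) {
                progress = true
            } else {
                next = append(next, p)
            }
        }
        if !progress || len(next) == 0 {
            return next
        }
        remaining = next
    }
}

func main() {
    // Toy example: pod-b only fits after pod-a has been placed, e.g. a zonal
    // topology spread that needs another pod in its zone first.
    placed := map[string]bool{}
    failed := scheduleWhileProgress([]string{"pod-b", "pod-a"}, func(pod string) bool {
        if pod == "pod-b" && !placed["pod-a"] {
            return false
        }
        placed[pod] = true
        return true
    })
    fmt.Println("unschedulable:", failed)
}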
type ReservationManager ¶ added in v1.2.1
type ReservationManager struct {
// contains filtered or unexported fields
}
func NewReservationManager ¶ added in v1.2.1
func NewReservationManager(instanceTypes map[string][]*cloudprovider.InstanceType) *ReservationManager
func (*ReservationManager) Release ¶ added in v1.2.1
func (rm *ReservationManager) Release(hostname string, offerings ...*cloudprovider.Offering)
func (*ReservationManager) Reserve ¶ added in v1.2.1
func (rm *ReservationManager) Reserve(hostname string, offering *cloudprovider.Offering) bool
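A minimal in-package sketch of how Reserve and Release compose, assuming Reserve returns false when the offering has no remaining capacity for the given hostname; the reserveOrRelease helper is hypothetical, not part of the package:

package scheduling

import (
    "sigs.k8s.io/karpenter/pkg/cloudprovider"
)

// reserveOrRelease attempts to reserve each offering for the hostname and rolls
// back all successful reservations on the first failure.
func reserveOrRelease(rm *ReservationManager, hostname string, offerings []*cloudprovider.Offering) bool {
    var reserved []*cloudprovider.Offering
    for _, o := range offerings {
        if !rm.Reserve(hostname, o) {
            // Capacity exhausted for this offering; undo what was reserved so far.
            rm.Release(hostname, reserved...)
            return false
        }
        reserved = append(reserved, o)
    }
    return true
}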
type ReservedOfferingError ¶ added in v1.2.1
type ReservedOfferingError struct {
// contains filtered or unexported fields
}
ReservedOfferingError indicates a NodeClaim couldn't be created, or a pod couldn't be added to an existing NodeClaim, due to reserved offering constraints.
func NewReservedOfferingError ¶ added in v1.2.1
func NewReservedOfferingError(err error) ReservedOfferingError
func (ReservedOfferingError) Unwrap ¶ added in v1.2.1
func (e ReservedOfferingError) Unwrap() error
type ReservedOfferingMode ¶ added in v1.2.1
type ReservedOfferingMode int
const (
    // ReservedOfferingModeFallback indicates to the scheduler that the addition of a pod to a nodeclaim which
    // results in all potential reserved offerings being filtered out is allowed (e.g. on-demand / spot fallback).
    ReservedOfferingModeFallback ReservedOfferingMode = iota
    // ReservedOfferingModeStrict indicates that the scheduler should fail to add a pod to a nodeclaim if doing so would
    // prevent it from scheduling to reserved capacity, when it would have otherwise.
    ReservedOfferingModeStrict
)
TODO: Evaluate if another mode should be created for drift. The problem with strict is that it assumes we can run multiple scheduling loops to make progress, but if scheduling all pods from the drifted node in a single iteration requires fallback, we're at a stalemate. This makes strict a non-starter for drift IMO. On the other hand, fallback will result in non-ideal launches when there's constrained capacity. This should be rectified by consolidation, but if we can be "right" at the initial launch, that would be preferable. One potential improvement is a "preferences" type strategy, where we attempt to schedule the pod without fallback first. This is an improvement over the current fallback strategy since it ensures all new nodeclaims are attempted before then attempting all nodepools, but it still doesn't address the case when offerings are reserved pessimistically. I don't believe there's a solution to this short of the max-flow based instance selection algorithm, which has its own drawbacks.
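Downstream handling of these modes can be sketched against the documented Results and IsReservedOfferingError helpers. The function below is hypothetical and simply separates pods that failed because of reserved-capacity constraints (e.g. under ReservedOfferingModeStrict, as configured by the DisableReservedCapacityFallback option) from pods with other errors; Results.ReservedOfferingErrors performs a similar filtering:

package scheduling

import (
    corev1 "k8s.io/api/core/v1"
)

// reservedOfferingFailures splits the pod errors from a scheduling run into
// reserved-offering failures and everything else.
func reservedOfferingFailures(results Results) (reserved, other map[*corev1.Pod]error) {
    reserved = map[*corev1.Pod]error{}
    other = map[*corev1.Pod]error{}
    for pod, err := range results.PodErrors {
        if IsReservedOfferingError(err) {
            reserved[pod] = err
        } else {
            other[pod] = err
        }
    }
    return reserved, other
}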
type Results ¶
type Results struct {
    NewNodeClaims []*NodeClaim
    ExistingNodes []*ExistingNode
    PodErrors     map[*corev1.Pod]error
}
Results contains the results of the scheduling operation
func (Results) AllNonPendingPodsScheduled ¶
AllNonPendingPodsScheduled returns true if all pods have been scheduled. We don't care if a pod was pending before consolidation and will still be pending after; it may be a pod that we can't schedule at all, and we don't want it to block consolidation.
func (Results) NonPendingPodSchedulingErrors ¶
NonPendingPodSchedulingErrors creates a string, suitable for presentation, that describes why pods wouldn't schedule.
func (Results) Record ¶ added in v0.35.0
Record sends eventing and log messages back for the results that were produced from a scheduling run. It also nominates nodes in the cluster state based on the scheduling run, signaling to other components that leverage the cluster state that a recorded scheduling run is relying on these nodes.
func (Results) ReservedOfferingErrors ¶ added in v1.2.1
func (Results) TruncateInstanceTypes ¶ added in v0.35.0
TruncateInstanceTypes filters the results based on the maximum number of instance types to be considered. It truncates the instance type options of every NodeClaim in the Results' NewNodeClaims.
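The Results helpers are typically chained after a scheduling run. A minimal in-package sketch; the recordResults helper itself is hypothetical:

package scheduling

import (
    "context"
    "fmt"

    "sigs.k8s.io/karpenter/pkg/controllers/state"
    "sigs.k8s.io/karpenter/pkg/events"
)

// recordResults truncates instance types, records events and node nominations,
// and surfaces any pods that still fail to schedule.
func recordResults(ctx context.Context, recorder events.Recorder, cluster *state.Cluster, results Results) error {
    results = results.TruncateInstanceTypes(MaxInstanceTypes)
    results.Record(ctx, recorder, cluster)
    if !results.AllNonPendingPodsScheduled() {
        return fmt.Errorf("failed to schedule pods: %s", results.NonPendingPodSchedulingErrors())
    }
    return nil
}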
type Scheduler ¶
type Scheduler struct {
// contains filtered or unexported fields
}
func NewScheduler ¶
func NewScheduler(
    ctx context.Context,
    kubeClient client.Client,
    nodePools []*v1.NodePool,
    cluster *state.Cluster,
    stateNodes []*state.StateNode,
    topology *Topology,
    instanceTypes map[string][]*cloudprovider.InstanceType,
    daemonSetPods []*corev1.Pod,
    recorder events.Recorder,
    clock clock.Clock,
    opts ...Options,
) *Scheduler
type Topology ¶
type Topology struct {
// contains filtered or unexported fields
}
func NewTopology ¶
func (*Topology) AddRequirements ¶
func (t *Topology) AddRequirements(p *corev1.Pod, taints []corev1.Taint, podRequirements, nodeRequirements scheduling.Requirements, compatabilityOptions ...option.Function[scheduling.CompatibilityOptions]) (scheduling.Requirements, error)
AddRequirements tightens the input requirements by adding additional requirements that are being enforced by topology spreads, affinities, anti-affinities, or inverse anti-affinities. The nodeHostname is the hostname that we are currently considering placing the pod on. It returns these newly tightened requirements, or an error if the requirements cannot be satisfied.
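A minimal in-package sketch of calling AddRequirements before committing a pod to a candidate node. It assumes scheduling.NewPodRequirements is the helper used to derive the pod-level requirements, and the tightenForNode wrapper is hypothetical:

package scheduling

import (
    corev1 "k8s.io/api/core/v1"

    "sigs.k8s.io/karpenter/pkg/scheduling"
)

// tightenForNode tightens a candidate node's requirements with the pod's
// topology constraints and fails fast if they cannot be satisfied.
func tightenForNode(topology *Topology, pod *corev1.Pod, taints []corev1.Taint, nodeRequirements scheduling.Requirements) (scheduling.Requirements, error) {
    podRequirements := scheduling.NewPodRequirements(pod)
    tightened, err := topology.AddRequirements(pod, taints, podRequirements, nodeRequirements)
    if err != nil {
        return nil, err // the pod's topology constraints cannot be met on this node
    }
    return tightened, nil
}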
func (*Topology) Record ¶
func (t *Topology) Record(p *corev1.Pod, taints []corev1.Taint, requirements scheduling.Requirements, compatabilityOptions ...option.Function[scheduling.CompatibilityOptions])
Record records the topology changes given that pod p schedules on a node with the given requirements.
func (*Topology) Register ¶
Register is used to register a domain as available across topologies for the given topology key.
func (*Topology) Unregister ¶ added in v1.1.0
Unregister is used to unregister a domain as available across topologies for the given topology key.
func (*Topology) Update ¶
Update unregisters the pod as the owner of all affinities, then creates any new topologies based on the pod spec and registers the pod as the owner of all associated affinities, new or old. This allows Update() to be called after relaxation of a preference to properly break the topology <-> owner relationship so that the preferred topology will no longer influence scheduling.
type TopologyDomainGroup ¶ added in v1.2.1
TopologyDomainGroup tracks the domains for a single topology. Additionally, it tracks the taints associated with each of these domains. This enables us to determine which domains should be considered by a pod if its NodeTaintPolicy is honor.
func NewTopologyDomainGroup ¶ added in v1.2.1
func NewTopologyDomainGroup() TopologyDomainGroup
func (TopologyDomainGroup) ForEachDomain ¶ added in v1.2.1
func (t TopologyDomainGroup) ForEachDomain(pod *v1.Pod, taintHonorPolicy v1.NodeInclusionPolicy, f func(domain string))
ForEachDomain calls f on each domain tracked by the topology group. If the taintHonorPolicy is honor, only domains available on nodes tolerated by the provided pod will be included.
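A small in-package sketch using only the documented ForEachDomain signature; the tolerableDomains helper is hypothetical:

package scheduling

import (
    corev1 "k8s.io/api/core/v1"
)

// tolerableDomains collects the domains a pod may count against when the
// topology's node taint policy is Honor.
func tolerableDomains(group TopologyDomainGroup, pod *corev1.Pod) []string {
    var domains []string
    group.ForEachDomain(pod, corev1.NodeInclusionPolicyHonor, func(domain string) {
        domains = append(domains, domain)
    })
    return domains
}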
type TopologyGroup ¶
type TopologyGroup struct {
    // Hashed Fields
    Key  string
    Type TopologyType
    // contains filtered or unexported fields
}
TopologyGroup is used to track pod counts that match a selector by the topology domain (e.g. SELECT COUNT(*) FROM pods GROUP BY(topology_key)).
func NewTopologyGroup ¶
func NewTopologyGroup(
    topologyType TopologyType,
    topologyKey string,
    pod *corev1.Pod,
    namespaces sets.Set[string],
    labelSelector *metav1.LabelSelector,
    maxSkew int32,
    minDomains *int32,
    taintPolicy *corev1.NodeInclusionPolicy,
    affinityPolicy *corev1.NodeInclusionPolicy,
    domainGroup TopologyDomainGroup,
) *TopologyGroup
func (*TopologyGroup) AddOwner ¶
func (t *TopologyGroup) AddOwner(key types.UID)
func (*TopologyGroup) Counts ¶
func (t *TopologyGroup) Counts(pod *corev1.Pod, taints []corev1.Taint, requirements scheduling.Requirements, compatabilityOptions ...option.Function[scheduling.CompatibilityOptions]) bool
Counts returns true if the pod would count for the topology, given that it schedules to a node with the provided requirements.
func (*TopologyGroup) Get ¶
func (t *TopologyGroup) Get(pod *corev1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement
func (*TopologyGroup) Hash ¶
func (t *TopologyGroup) Hash() uint64
Hash is used so we can track single topologies that affect multiple groups of pods. If a deployment has 100x pods with self anti-affinity, we track that as a single topology with 100 owners instead of 100x topologies.
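The deduplication pattern Hash enables can be sketched with a map keyed by the hash; the mergeByHash helper below is hypothetical:

package scheduling

import (
    "k8s.io/apimachinery/pkg/types"
)

// mergeByHash tracks topologies that hash identically as a single TopologyGroup
// that accumulates owners, instead of one group per pod.
func mergeByHash(groups map[uint64]*TopologyGroup, candidate *TopologyGroup, owner types.UID) *TopologyGroup {
    if existing, ok := groups[candidate.Hash()]; ok {
        existing.AddOwner(owner)
        return existing
    }
    candidate.AddOwner(owner)
    groups[candidate.Hash()] = candidate
    return candidate
}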
func (*TopologyGroup) Record ¶
func (t *TopologyGroup) Record(domains ...string)
func (*TopologyGroup) Register ¶
func (t *TopologyGroup) Register(domains ...string)
Register ensures that the topology is aware of the given domain names.
func (*TopologyGroup) RemoveOwner ¶
func (t *TopologyGroup) RemoveOwner(key types.UID)
func (*TopologyGroup) Unregister ¶ added in v1.1.0
func (t *TopologyGroup) Unregister(domains ...string)
type TopologyNodeFilter ¶
type TopologyNodeFilter struct {
    Requirements   []scheduling.Requirements
    TaintPolicy    corev1.NodeInclusionPolicy
    AffinityPolicy corev1.NodeInclusionPolicy
    Tolerations    []corev1.Toleration
}
TopologyNodeFilter is used to determine if a given actual node or scheduling node matches the pod's node selectors and required node affinity terms. This is used with topology spread constraints to determine if the node should be included for topology counting purposes. This is only used with topology spread constraints as affinities/anti-affinities always count across all nodes. A nil or zero-value TopologyNodeFilter behaves well and the filter returns true for all nodes.
func MakeTopologyNodeFilter ¶
func MakeTopologyNodeFilter(p *corev1.Pod, taintPolicy corev1.NodeInclusionPolicy, affinityPolicy corev1.NodeInclusionPolicy) TopologyNodeFilter
func (TopologyNodeFilter) Matches ¶
func (t TopologyNodeFilter) Matches(taints []corev1.Taint, requirements scheduling.Requirements, compatibilityOptions ...option.Function[scheduling.CompatibilityOptions]) bool
Matches returns true if the TopologyNodeFilter doesn't prohibit the node from participating in the topology.
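A minimal in-package sketch of building a filter and testing a node against it. It assumes scheduling.NewLabelRequirements converts node labels into requirements, and the countsForSpread helper is hypothetical:

package scheduling

import (
    corev1 "k8s.io/api/core/v1"

    "sigs.k8s.io/karpenter/pkg/scheduling"
)

// countsForSpread reports whether a node, described by its taints and labels,
// should be counted for the pod's topology spread when both inclusion policies
// are Honor.
func countsForSpread(pod *corev1.Pod, nodeTaints []corev1.Taint, nodeLabels map[string]string) bool {
    filter := MakeTopologyNodeFilter(pod, corev1.NodeInclusionPolicyHonor, corev1.NodeInclusionPolicyHonor)
    return filter.Matches(nodeTaints, scheduling.NewLabelRequirements(nodeLabels))
}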
type TopologyType ¶
type TopologyType byte
const (
    TopologyTypeSpread TopologyType = iota
    TopologyTypePodAffinity
    TopologyTypePodAntiAffinity
)
func (TopologyType) String ¶
func (t TopologyType) String() string
type VolumeTopology ¶
type VolumeTopology struct {
// contains filtered or unexported fields
}
func NewVolumeTopology ¶
func NewVolumeTopology(kubeClient client.Client) *VolumeTopology
func (*VolumeTopology) ValidatePersistentVolumeClaims ¶
ValidatePersistentVolumeClaims returns an error if the pod doesn't appear to be valid with respect to PVCs (e.g. the PVC is not found or references an unknown storage class).
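A short sketch of wiring this into pod validation; the (ctx, pod) method signature is not shown above and is assumed here, as is the validatePodPVCs wrapper:

package scheduling

import (
    "context"

    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// validatePodPVCs builds a VolumeTopology and checks that every PVC referenced
// by the pod resolves to a known claim and storage class.
func validatePodPVCs(ctx context.Context, kubeClient client.Client, pod *corev1.Pod) error {
    vt := NewVolumeTopology(kubeClient)
    return vt.ValidatePersistentVolumeClaims(ctx, pod)
}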