Add nvidia-plugin and node-feature-discovery to cluster setup.
This commit is contained in:
40
setup/cluster-services/node-feature-discovery/install.sh
Executable file
40
setup/cluster-services/node-feature-discovery/install.sh
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
set -o pipefail
|
||||
|
||||
# Initialize Wild Cloud environment.
# WC_ROOT must be set; scripts/common.sh under it is sourced and is expected
# to provide init_wild_env plus the logging helpers used later in this script.
if [ -z "${WC_ROOT}" ]; then
    # common.sh has not been sourced yet, so its helpers are unavailable here.
    # `print` is not a bash command (it would fail with "command not found"
    # and hide the real message), so report with echo on stderr instead.
    echo "WC_ROOT is not set." >&2
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi
|
||||
|
||||
# Locations of the compiled cluster-service manifests.
# NOTE(review): WC_HOME is presumably exported by init_wild_env — confirm.
CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"

print_header "Setting up Node Feature Discovery"

# Templates should already be compiled by wild-cluster-services-configure
info "Using pre-compiled Node Feature Discovery templates..."
if [ ! -d "${NFD_DIR}/kustomize" ]; then
    error "Compiled templates not found. Run 'wild-cluster-services-configure node-feature-discovery' first."
    exit 1
fi

info "Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"

info "Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s

# The kustomization also deploys the nfd-master Deployment; wait for it too so
# that success is only reported once every workload applied above has rolled out.
info "Waiting for Node Feature Discovery master Deployment to be ready..."
kubectl rollout status deployment/node-feature-discovery-master -n node-feature-discovery --timeout=300s

success "Node Feature Discovery installed successfully"
|
||||
|
||||
#######################################
# Print follow-up commands the operator can run to verify the installation.
# Arguments: none
# Outputs:   writes the hints to stdout
#######################################
print_nfd_verification_hints() {
    echo ""
    echo "To verify the installation:"
    echo " kubectl get pods -n node-feature-discovery"
    echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
    echo ""
    echo "GPU nodes should now be labeled with GPU device information:"
    # With NFD's default worker configuration the PCI label has the form
    # pci-<class>_<vendor>.present (e.g. pci-0300_10de.present), so a plain
    # "pci-10de" pattern never matches; match any class with vendor 10de.
    echo " kubectl get nodes --show-labels | grep -E 'pci-[0-9a-f]+_10de'"
}

print_nfd_verification_hints
|
||||
@@ -0,0 +1,711 @@
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeatures.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeature
|
||||
listKind: NodeFeatureList
|
||||
plural: nodefeatures
|
||||
singular: nodefeature
|
||||
scope: Namespaced
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: |-
|
||||
NodeFeature resource holds the features discovered for one node in the
|
||||
cluster.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Specification of the NodeFeature, containing features discovered
|
||||
for a node.
|
||||
properties:
|
||||
features:
|
||||
description: Features is the full "raw" features data that has been
|
||||
discovered.
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
description: AttributeFeatureSet is a set of features having
|
||||
string value.
|
||||
properties:
|
||||
elements:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Individual features of the feature set.
|
||||
type: object
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Attributes contains all the attribute-type features
|
||||
of the node.
|
||||
type: object
|
||||
flags:
|
||||
additionalProperties:
|
||||
description: FlagFeatureSet is a set of simple features only
|
||||
containing names without values.
|
||||
properties:
|
||||
elements:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
Nil is a dummy empty struct for protobuf compatibility.
|
||||
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
|
||||
type: object
|
||||
description: Individual features of the feature set.
|
||||
type: object
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Flags contains all the flag-type features of the
|
||||
node.
|
||||
type: object
|
||||
instances:
|
||||
additionalProperties:
|
||||
description: InstanceFeatureSet is a set of features each of
|
||||
which is an instance having multiple attributes.
|
||||
properties:
|
||||
elements:
|
||||
description: Individual features of the feature set.
|
||||
items:
|
||||
description: InstanceFeature represents one instance of
|
||||
a complex features, e.g. a device.
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Attributes of the instance feature.
|
||||
type: object
|
||||
required:
|
||||
- attributes
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Instances contains all the instance-type features
|
||||
of the node.
|
||||
type: object
|
||||
type: object
|
||||
labels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Labels is the set of node labels that are requested to
|
||||
be created.
|
||||
type: object
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeaturegroups.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeatureGroup
|
||||
listKind: NodeFeatureGroupList
|
||||
plural: nodefeaturegroups
|
||||
shortNames:
|
||||
- nfg
|
||||
singular: nodefeaturegroup
|
||||
scope: Namespaced
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: NodeFeatureGroup resource holds Node pools by featureGroup
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Spec defines the rules to be evaluated.
|
||||
properties:
|
||||
featureGroupRules:
|
||||
description: List of rules to evaluate to determine nodes that belong
|
||||
in this group.
|
||||
items:
|
||||
description: GroupRule defines a rule for nodegroup filtering.
|
||||
properties:
|
||||
matchAny:
|
||||
description: MatchAny specifies a list of matchers one of which
|
||||
must match.
|
||||
items:
|
||||
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||
properties:
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher
|
||||
terms all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature
|
||||
set to match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- matchFeatures
|
||||
type: object
|
||||
type: array
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher terms
|
||||
all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature set to
|
||||
match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
name:
|
||||
description: Name of the rule.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- featureGroupRules
|
||||
type: object
|
||||
status:
|
||||
description: |-
|
||||
Status of the NodeFeatureGroup after the most recent evaluation of the
|
||||
specification.
|
||||
properties:
|
||||
nodes:
|
||||
description: Nodes is a list of FeatureGroupNode in the cluster that
|
||||
match the featureGroupRules
|
||||
items:
|
||||
properties:
|
||||
name:
|
||||
description: Name of the node.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
x-kubernetes-list-map-keys:
|
||||
- name
|
||||
x-kubernetes-list-type: map
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
subresources:
|
||||
status: {}
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeaturerules.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeatureRule
|
||||
listKind: NodeFeatureRuleList
|
||||
plural: nodefeaturerules
|
||||
shortNames:
|
||||
- nfr
|
||||
singular: nodefeaturerule
|
||||
scope: Cluster
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: |-
|
||||
NodeFeatureRule resource specifies a configuration for feature-based
|
||||
customization of node objects, such as node labeling.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Spec defines the rules to be evaluated.
|
||||
properties:
|
||||
rules:
|
||||
description: Rules is a list of node customization rules.
|
||||
items:
|
||||
description: Rule defines a rule for node customization such as
|
||||
labeling.
|
||||
properties:
|
||||
annotations:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Annotations to create if the rule matches.
|
||||
type: object
|
||||
extendedResources:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: ExtendedResources to create if the rule matches.
|
||||
type: object
|
||||
labels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Labels to create if the rule matches.
|
||||
type: object
|
||||
labelsTemplate:
|
||||
description: |-
|
||||
LabelsTemplate specifies a template to expand for dynamically generating
|
||||
multiple labels. Data (after template expansion) must be keys with an
|
||||
optional value (<key>[=<value>]) separated by newlines.
|
||||
type: string
|
||||
matchAny:
|
||||
description: MatchAny specifies a list of matchers one of which
|
||||
must match.
|
||||
items:
|
||||
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||
properties:
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher
|
||||
terms all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature
|
||||
set to match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- matchFeatures
|
||||
type: object
|
||||
type: array
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher terms
|
||||
all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature set to
|
||||
match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
name:
|
||||
description: Name of the rule.
|
||||
type: string
|
||||
taints:
|
||||
description: Taints to create if the rule matches.
|
||||
items:
|
||||
description: |-
|
||||
The node this Taint is attached to has the "effect" on
|
||||
any pod that does not tolerate the Taint.
|
||||
properties:
|
||||
effect:
|
||||
description: |-
|
||||
Required. The effect of the taint on pods
|
||||
that do not tolerate the taint.
|
||||
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
|
||||
type: string
|
||||
key:
|
||||
description: Required. The taint key to be applied to
|
||||
a node.
|
||||
type: string
|
||||
timeAdded:
|
||||
description: |-
|
||||
TimeAdded represents the time at which the taint was added.
|
||||
It is only written for NoExecute taints.
|
||||
format: date-time
|
||||
type: string
|
||||
value:
|
||||
description: The taint value corresponding to the taint
|
||||
key.
|
||||
type: string
|
||||
required:
|
||||
- effect
|
||||
- key
|
||||
type: object
|
||||
type: array
|
||||
vars:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: |-
|
||||
Vars is the variables to store if the rule matches. Variables do not
|
||||
directly inflict any changes in the node object. However, they can be
|
||||
referenced from other rules enabling more complex rule hierarchies,
|
||||
without exposing intermediary output values as labels.
|
||||
type: object
|
||||
varsTemplate:
|
||||
description: |-
|
||||
VarsTemplate specifies a template to expand for dynamically generating
|
||||
multiple variables. Data (after template expansion) must be keys with an
|
||||
optional value (<key>[=<value>]) separated by newlines.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- rules
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
@@ -0,0 +1,86 @@
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-feature-discovery-worker
|
||||
namespace: node-feature-discovery
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: node-feature-discovery-worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: node-feature-discovery-worker
|
||||
spec:
|
||||
serviceAccountName: node-feature-discovery
|
||||
securityContext:
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: worker
|
||||
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
resources:
|
||||
limits:
|
||||
memory: 512Mi
|
||||
requests:
|
||||
cpu: 5m
|
||||
memory: 64Mi
|
||||
command:
|
||||
- "nfd-worker"
|
||||
args:
|
||||
- "-metrics=8081"
|
||||
- "-grpc-health=8082"
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
name: metrics
|
||||
- containerPort: 8082
|
||||
name: health
|
||||
volumeMounts:
|
||||
- name: host-boot
|
||||
mountPath: "/host-boot"
|
||||
readOnly: true
|
||||
- name: host-os-release
|
||||
mountPath: "/host-etc/os-release"
|
||||
readOnly: true
|
||||
- name: host-sys
|
||||
mountPath: "/host-sys"
|
||||
readOnly: true
|
||||
- name: host-usr-lib
|
||||
mountPath: "/host-usr/lib"
|
||||
readOnly: true
|
||||
- name: host-lib
|
||||
mountPath: "/host-lib"
|
||||
readOnly: true
|
||||
- name: host-proc-swaps
|
||||
mountPath: "/host-proc/swaps"
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: host-boot
|
||||
hostPath:
|
||||
path: "/boot"
|
||||
- name: host-os-release
|
||||
hostPath:
|
||||
path: "/etc/os-release"
|
||||
- name: host-sys
|
||||
hostPath:
|
||||
path: "/sys"
|
||||
- name: host-usr-lib
|
||||
hostPath:
|
||||
path: "/usr/lib"
|
||||
- name: host-lib
|
||||
hostPath:
|
||||
path: "/lib"
|
||||
- name: host-proc-swaps
|
||||
hostPath:
|
||||
path: "/proc/swaps"
|
||||
@@ -0,0 +1,14 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: node-feature-discovery
|
||||
labels:
|
||||
- pairs:
|
||||
app.kubernetes.io/name: node-feature-discovery
|
||||
managedBy: kustomize
|
||||
partOf: wild-cloud
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- crds.yaml
|
||||
- rbac.yaml
|
||||
- daemonset.yaml
|
||||
- master.yaml
|
||||
@@ -0,0 +1,49 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: node-feature-discovery-master
|
||||
namespace: node-feature-discovery
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
name: node-feature-discovery-master
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: node-feature-discovery-master
|
||||
spec:
|
||||
serviceAccountName: node-feature-discovery
|
||||
securityContext:
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: master
|
||||
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
command:
|
||||
- "nfd-master"
|
||||
args:
|
||||
- "-metrics=8081"
|
||||
- "-grpc-health=8082"
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
name: metrics
|
||||
- containerPort: 8082
|
||||
name: health
|
||||
resources:
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
memory: 128Mi
|
||||
@@ -0,0 +1,8 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: node-feature-discovery
|
||||
labels:
|
||||
pod-security.kubernetes.io/enforce: privileged
|
||||
pod-security.kubernetes.io/audit: privileged
|
||||
pod-security.kubernetes.io/warn: privileged
|
||||
@@ -0,0 +1,55 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: node-feature-discovery
|
||||
namespace: node-feature-discovery
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: node-feature-discovery
|
||||
rules:
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- nodes
|
||||
- nodes/status
|
||||
verbs:
|
||||
- get
|
||||
- patch
|
||||
- update
|
||||
- list
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- namespaces
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- nfd.k8s-sigs.io
|
||||
resources:
|
||||
- nodefeatures
|
||||
- nodefeaturerules
|
||||
- nodefeaturegroups
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- create
|
||||
- update
|
||||
- patch
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: node-feature-discovery
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: node-feature-discovery
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: node-feature-discovery
|
||||
namespace: node-feature-discovery
|
||||
98
setup/cluster-services/nvidia-device-plugin/README.md
Normal file
98
setup/cluster-services/nvidia-device-plugin/README.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# NVIDIA Device Plugin
|
||||
|
||||
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
|
||||
|
||||
## Overview
|
||||
|
||||
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
|
||||
- Discovers NVIDIA GPUs on worker nodes
|
||||
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
|
||||
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
|
||||
- Enables pods to request GPU resources
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before installing the NVIDIA Device Plugin, ensure that:
|
||||
|
||||
1. **NVIDIA Drivers** are installed (>= 384.81)
|
||||
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
|
||||
3. **nvidia-container-runtime** is configured as the default container runtime
|
||||
4. Worker nodes have NVIDIA GPUs
|
||||
|
||||
### Talos Linux Requirements
|
||||
|
||||
For Talos Linux nodes, you need:
|
||||
- NVIDIA drivers extension in the Talos schematic
|
||||
- nvidia-container-toolkit extension
|
||||
- Proper container runtime configuration
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Configure and install the service
|
||||
wild-cluster-services-configure nvidia-device-plugin
|
||||
wild-cluster-install nvidia-device-plugin
|
||||
```
|
||||
|
||||
## Verification
|
||||
|
||||
After installation, verify the plugin is working:
|
||||
|
||||
```bash
|
||||
# Check plugin pods are running
|
||||
kubectl get pods -n kube-system | grep nvidia
|
||||
|
||||
# Verify GPU resources are advertised
|
||||
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
|
||||
|
||||
# Check GPU node labels
|
||||
kubectl get nodes --show-labels | grep nvidia
|
||||
```
|
||||
|
||||
## Usage in Applications
|
||||
|
||||
Once installed, applications can request GPU resources:
|
||||
|
||||
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-app
spec:
  # apps/v1 Deployments require a selector that matches the template labels
  selector:
    matchLabels:
      app: gpu-app
  template:
    metadata:
      labels:
        app: gpu-app
    spec:
      containers:
        - name: app
          # Pin a concrete tag; the nvidia/cuda image does not publish `latest`
          image: nvidia/cuda:12.4.1-base-ubuntu22.04
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Plugin Not Starting
|
||||
- Verify NVIDIA drivers are installed on worker nodes
|
||||
- Check that nvidia-container-toolkit is properly configured
|
||||
- Ensure worker nodes are not tainted in a way that prevents scheduling
|
||||
|
||||
### No GPU Resources Advertised
|
||||
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
|
||||
- Verify NVIDIA runtime is the default container runtime
|
||||
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages
|
||||
|
||||
## Configuration
|
||||
|
||||
The plugin uses the following configuration:
|
||||
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
|
||||
- **Namespace**: `kube-system`
|
||||
- **Priority Class**: `system-node-critical`
|
||||
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint
|
||||
|
||||
## References
|
||||
|
||||
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
|
||||
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
|
||||
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)
|
||||
51
setup/cluster-services/nvidia-device-plugin/install.sh
Executable file
51
setup/cluster-services/nvidia-device-plugin/install.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
#
# Install the NVIDIA Device Plugin cluster service.
#
# Applies the pre-compiled kustomize manifests found in
# ${WC_HOME}/setup/cluster-services/nvidia-device-plugin/kustomize and waits
# for the nvidia-device-plugin-daemonset rollout in kube-system to complete.
#
# Required environment:
#   WC_ROOT - directory containing scripts/common.sh, which is sourced below.
#   WC_HOME - read below; presumably set up by init_wild_env (confirm in
#             common.sh).
#
# Exits non-zero when WC_ROOT is unset, when the cluster has no worker nodes,
# when the compiled templates are missing, or when any kubectl step fails.
set -e
set -o pipefail

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
    # common.sh has not been sourced yet, so its print_* helpers are not
    # available, and `print` is not a bash command. Use echo on stderr so the
    # message is actually shown.
    echo "WC_ROOT is not set." >&2
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi

CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

print_header "Setting up NVIDIA Device Plugin"

# Pre-flight check. Note that this only verifies that worker nodes exist; GPU
# detection itself is left to the DaemonSet's node affinity.
print_info "Checking for NVIDIA GPUs in the cluster..."

# Check if any worker nodes exist (device plugin only runs on worker nodes)
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
# Some wc implementations pad the count with spaces; strip them so the
# message below prints cleanly.
WORKER_NODES="${WORKER_NODES//[[:space:]]/}"
if [ "${WORKER_NODES}" -eq 0 ]; then
    print_error "No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
    exit 1
fi

print_info "Found ${WORKER_NODES} worker node(s)"

# Templates should already be compiled by wild-cluster-services-generate
print_info "Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
    print_error "Compiled templates not found. Run 'wild-cluster-services-generate' first."
    exit 1
fi

print_info "Deploying NVIDIA Device Plugin..."
# Quoted so a WC_HOME containing spaces does not split the path.
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

print_info "Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

print_success "NVIDIA Device Plugin installed successfully"
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"
|
||||
@@ -0,0 +1,91 @@
|
||||
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
#
# Schedules one device-plugin pod on each node that carries a Node Feature
# Discovery label indicating an NVIDIA PCI device (vendor ID 10de). The pod
# runs under the "nvidia" RuntimeClass.

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        # Must keep matching spec.selector.matchLabels above.
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            # nodeSelectorTerms are ORed: a node qualifies when it matches
            # any one of the terms below.
            nodeSelectorTerms:
              # PCI class 0300 (VGA-compatible controller), vendor 10de.
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
              # PCI class 0302 (3D controller), vendor 10de: NVIDIA GPUs that
              # do not enumerate as a VGA device.
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0302_10de.present
                    operator: In
                    values:
                      - "true"
              # Vendor-only label, present when NFD is configured to build
              # PCI labels from the vendor field alone.
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            # Same path as the hostPath of the mps-root volume below.
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
|
||||
@@ -0,0 +1,12 @@
|
||||
# Kustomization for the NVIDIA Device Plugin cluster service.
# Bundles the DaemonSet and the RuntimeClass it references, and stamps the
# label pairs below onto the generated resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: kube-system

resources:
  - daemonset.yaml
  - runtimeclass.yaml

labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud
|
||||
@@ -0,0 +1,5 @@
|
||||
# RuntimeClass named "nvidia"; the device plugin DaemonSet in this service
# selects it through `runtimeClassName: nvidia`.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
# Name of the container-runtime handler this class maps to. A handler with
# this name must be configured on the nodes (TODO confirm for the target
# node image).
handler: nvidia
|
||||
Reference in New Issue
Block a user