Add nvidia-plugin and node-feature-discovery to cluster setup.
This commit is contained in:
40
setup/cluster-services/node-feature-discovery/install.sh
Executable file
40
setup/cluster-services/node-feature-discovery/install.sh
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/bash
#
# Installs Node Feature Discovery into the cluster from the kustomize
# directory under ${WC_HOME}/setup/cluster-services/node-feature-discovery.
#
# Required environment:
#   WC_ROOT - directory containing scripts/common.sh, which is sourced below.

# Abort on the first failing command, including failures inside pipelines.
set -e
set -o pipefail

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
    # common.sh has not been sourced yet, so none of its helpers exist at this
    # point. `print` is a ksh/zsh builtin, not a bash command; calling it here
    # produced "print: command not found" (status 127) instead of this
    # message. Use echo and send the diagnostic to stderr.
    echo "WC_ROOT is not set." >&2
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi
|
||||||
|
|
||||||
|
# Directory layout of the cluster-service manifests below WC_HOME.
CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"

print_header "Setting up Node Feature Discovery"

# This script only consumes the kustomize output; it never generates it.
# NOTE(review): the previous comment here named wild-cluster-services-generate
# as the producer, while the error text below tells the user to run
# wild-cluster-services-configure -- confirm which command is current.
info "Using pre-compiled Node Feature Discovery templates..."
if [[ ! -d "${NFD_DIR}/kustomize" ]]; then
    error "Compiled templates not found. Run 'wild-cluster-services-configure node-feature-discovery' first."
    exit 1
fi

# Apply every resource listed in the kustomization in one pass.
info "Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"

# Block until the worker DaemonSet has rolled out; a timeout makes kubectl
# return non-zero, which aborts the script under `set -e`.
info "Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker \
    --namespace node-feature-discovery \
    --timeout=300s
|
||||||
|
|
||||||
|
success "Node Feature Discovery installed successfully"

# Print copy-pasteable commands the operator can use to check the result.
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n node-feature-discovery"
echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
echo ""
echo "GPU nodes should now be labeled with GPU device information:"
# With NFD's default PCI label fields (class + vendor) an NVIDIA GPU is
# labelled like pci-0300_10de.present, which a plain "pci-10de" pattern never
# matches. The optional class prefix in the pattern covers both the default
# layout and a vendor-only layout (pci-10de.present).
echo " kubectl get nodes --show-labels | grep -E 'pci-([0-9a-f]+_)?10de'"
|
||||||
@@ -0,0 +1,711 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apiextensions.k8s.io/v1
|
||||||
|
kind: CustomResourceDefinition
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
controller-gen.kubebuilder.io/version: v0.16.3
|
||||||
|
name: nodefeatures.nfd.k8s-sigs.io
|
||||||
|
spec:
|
||||||
|
group: nfd.k8s-sigs.io
|
||||||
|
names:
|
||||||
|
kind: NodeFeature
|
||||||
|
listKind: NodeFeatureList
|
||||||
|
plural: nodefeatures
|
||||||
|
singular: nodefeature
|
||||||
|
scope: Namespaced
|
||||||
|
versions:
|
||||||
|
- name: v1alpha1
|
||||||
|
schema:
|
||||||
|
openAPIV3Schema:
|
||||||
|
description: |-
|
||||||
|
NodeFeature resource holds the features discovered for one node in the
|
||||||
|
cluster.
|
||||||
|
properties:
|
||||||
|
apiVersion:
|
||||||
|
description: |-
|
||||||
|
APIVersion defines the versioned schema of this representation of an object.
|
||||||
|
Servers should convert recognized schemas to the latest internal value, and
|
||||||
|
may reject unrecognized values.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||||
|
type: string
|
||||||
|
kind:
|
||||||
|
description: |-
|
||||||
|
Kind is a string value representing the REST resource this object represents.
|
||||||
|
Servers may infer this from the endpoint the client submits requests to.
|
||||||
|
Cannot be updated.
|
||||||
|
In CamelCase.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||||
|
type: string
|
||||||
|
metadata:
|
||||||
|
type: object
|
||||||
|
spec:
|
||||||
|
description: Specification of the NodeFeature, containing features discovered
|
||||||
|
for a node.
|
||||||
|
properties:
|
||||||
|
features:
|
||||||
|
description: Features is the full "raw" features data that has been
|
||||||
|
discovered.
|
||||||
|
properties:
|
||||||
|
attributes:
|
||||||
|
additionalProperties:
|
||||||
|
description: AttributeFeatureSet is a set of features having
|
||||||
|
string value.
|
||||||
|
properties:
|
||||||
|
elements:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: Individual features of the feature set.
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- elements
|
||||||
|
type: object
|
||||||
|
description: Attributes contains all the attribute-type features
|
||||||
|
of the node.
|
||||||
|
type: object
|
||||||
|
flags:
|
||||||
|
additionalProperties:
|
||||||
|
description: FlagFeatureSet is a set of simple features only
|
||||||
|
containing names without values.
|
||||||
|
properties:
|
||||||
|
elements:
|
||||||
|
additionalProperties:
|
||||||
|
description: |-
|
||||||
|
Nil is a dummy empty struct for protobuf compatibility.
|
||||||
|
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
|
||||||
|
type: object
|
||||||
|
description: Individual features of the feature set.
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- elements
|
||||||
|
type: object
|
||||||
|
description: Flags contains all the flag-type features of the
|
||||||
|
node.
|
||||||
|
type: object
|
||||||
|
instances:
|
||||||
|
additionalProperties:
|
||||||
|
description: InstanceFeatureSet is a set of features each of
|
||||||
|
which is an instance having multiple attributes.
|
||||||
|
properties:
|
||||||
|
elements:
|
||||||
|
description: Individual features of the feature set.
|
||||||
|
items:
|
||||||
|
description: InstanceFeature represents one instance of
|
||||||
|
a complex features, e.g. a device.
|
||||||
|
properties:
|
||||||
|
attributes:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: Attributes of the instance feature.
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- attributes
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- elements
|
||||||
|
type: object
|
||||||
|
description: Instances contains all the instance-type features
|
||||||
|
of the node.
|
||||||
|
type: object
|
||||||
|
type: object
|
||||||
|
labels:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: Labels is the set of node labels that are requested to
|
||||||
|
be created.
|
||||||
|
type: object
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- spec
|
||||||
|
type: object
|
||||||
|
served: true
|
||||||
|
storage: true
|
||||||
|
---
|
||||||
|
apiVersion: apiextensions.k8s.io/v1
|
||||||
|
kind: CustomResourceDefinition
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
controller-gen.kubebuilder.io/version: v0.16.3
|
||||||
|
name: nodefeaturegroups.nfd.k8s-sigs.io
|
||||||
|
spec:
|
||||||
|
group: nfd.k8s-sigs.io
|
||||||
|
names:
|
||||||
|
kind: NodeFeatureGroup
|
||||||
|
listKind: NodeFeatureGroupList
|
||||||
|
plural: nodefeaturegroups
|
||||||
|
shortNames:
|
||||||
|
- nfg
|
||||||
|
singular: nodefeaturegroup
|
||||||
|
scope: Namespaced
|
||||||
|
versions:
|
||||||
|
- name: v1alpha1
|
||||||
|
schema:
|
||||||
|
openAPIV3Schema:
|
||||||
|
description: NodeFeatureGroup resource holds Node pools by featureGroup
|
||||||
|
properties:
|
||||||
|
apiVersion:
|
||||||
|
description: |-
|
||||||
|
APIVersion defines the versioned schema of this representation of an object.
|
||||||
|
Servers should convert recognized schemas to the latest internal value, and
|
||||||
|
may reject unrecognized values.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||||
|
type: string
|
||||||
|
kind:
|
||||||
|
description: |-
|
||||||
|
Kind is a string value representing the REST resource this object represents.
|
||||||
|
Servers may infer this from the endpoint the client submits requests to.
|
||||||
|
Cannot be updated.
|
||||||
|
In CamelCase.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||||
|
type: string
|
||||||
|
metadata:
|
||||||
|
type: object
|
||||||
|
spec:
|
||||||
|
description: Spec defines the rules to be evaluated.
|
||||||
|
properties:
|
||||||
|
featureGroupRules:
|
||||||
|
description: List of rules to evaluate to determine nodes that belong
|
||||||
|
in this group.
|
||||||
|
items:
|
||||||
|
description: GroupRule defines a rule for nodegroup filtering.
|
||||||
|
properties:
|
||||||
|
matchAny:
|
||||||
|
description: MatchAny specifies a list of matchers one of which
|
||||||
|
must match.
|
||||||
|
items:
|
||||||
|
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||||
|
properties:
|
||||||
|
matchFeatures:
|
||||||
|
description: MatchFeatures specifies a set of matcher
|
||||||
|
terms all of which must match.
|
||||||
|
items:
|
||||||
|
description: |-
|
||||||
|
FeatureMatcherTerm defines requirements against one feature set. All
|
||||||
|
requirements (specified as MatchExpressions) are evaluated against each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
feature:
|
||||||
|
description: Feature is the name of the feature
|
||||||
|
set to match against.
|
||||||
|
type: string
|
||||||
|
matchExpressions:
|
||||||
|
additionalProperties:
|
||||||
|
description: |-
|
||||||
|
MatchExpression specifies an expression to evaluate against a set of input
|
||||||
|
values. It contains an operator that is applied when matching the input and
|
||||||
|
an array of values that the operator evaluates the input against.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
description: |-
|
||||||
|
MatchExpressions is the set of per-element expressions evaluated. These
|
||||||
|
match against the value of the specified elements.
|
||||||
|
type: object
|
||||||
|
matchName:
|
||||||
|
description: |-
|
||||||
|
MatchName in an expression that is matched against the name of each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- feature
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- matchFeatures
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
matchFeatures:
|
||||||
|
description: MatchFeatures specifies a set of matcher terms
|
||||||
|
all of which must match.
|
||||||
|
items:
|
||||||
|
description: |-
|
||||||
|
FeatureMatcherTerm defines requirements against one feature set. All
|
||||||
|
requirements (specified as MatchExpressions) are evaluated against each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
feature:
|
||||||
|
description: Feature is the name of the feature set to
|
||||||
|
match against.
|
||||||
|
type: string
|
||||||
|
matchExpressions:
|
||||||
|
additionalProperties:
|
||||||
|
description: |-
|
||||||
|
MatchExpression specifies an expression to evaluate against a set of input
|
||||||
|
values. It contains an operator that is applied when matching the input and
|
||||||
|
an array of values that the operator evaluates the input against.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
description: |-
|
||||||
|
MatchExpressions is the set of per-element expressions evaluated. These
|
||||||
|
match against the value of the specified elements.
|
||||||
|
type: object
|
||||||
|
matchName:
|
||||||
|
description: |-
|
||||||
|
MatchName in an expression that is matched against the name of each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- feature
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
name:
|
||||||
|
description: Name of the rule.
|
||||||
|
type: string
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- featureGroupRules
|
||||||
|
type: object
|
||||||
|
status:
|
||||||
|
description: |-
|
||||||
|
Status of the NodeFeatureGroup after the most recent evaluation of the
|
||||||
|
specification.
|
||||||
|
properties:
|
||||||
|
nodes:
|
||||||
|
description: Nodes is a list of FeatureGroupNode in the cluster that
|
||||||
|
match the featureGroupRules
|
||||||
|
items:
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
description: Name of the node.
|
||||||
|
type: string
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
x-kubernetes-list-map-keys:
|
||||||
|
- name
|
||||||
|
x-kubernetes-list-type: map
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- spec
|
||||||
|
type: object
|
||||||
|
served: true
|
||||||
|
storage: true
|
||||||
|
subresources:
|
||||||
|
status: {}
|
||||||
|
---
|
||||||
|
apiVersion: apiextensions.k8s.io/v1
|
||||||
|
kind: CustomResourceDefinition
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
controller-gen.kubebuilder.io/version: v0.16.3
|
||||||
|
name: nodefeaturerules.nfd.k8s-sigs.io
|
||||||
|
spec:
|
||||||
|
group: nfd.k8s-sigs.io
|
||||||
|
names:
|
||||||
|
kind: NodeFeatureRule
|
||||||
|
listKind: NodeFeatureRuleList
|
||||||
|
plural: nodefeaturerules
|
||||||
|
shortNames:
|
||||||
|
- nfr
|
||||||
|
singular: nodefeaturerule
|
||||||
|
scope: Cluster
|
||||||
|
versions:
|
||||||
|
- name: v1alpha1
|
||||||
|
schema:
|
||||||
|
openAPIV3Schema:
|
||||||
|
description: |-
|
||||||
|
NodeFeatureRule resource specifies a configuration for feature-based
|
||||||
|
customization of node objects, such as node labeling.
|
||||||
|
properties:
|
||||||
|
apiVersion:
|
||||||
|
description: |-
|
||||||
|
APIVersion defines the versioned schema of this representation of an object.
|
||||||
|
Servers should convert recognized schemas to the latest internal value, and
|
||||||
|
may reject unrecognized values.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||||
|
type: string
|
||||||
|
kind:
|
||||||
|
description: |-
|
||||||
|
Kind is a string value representing the REST resource this object represents.
|
||||||
|
Servers may infer this from the endpoint the client submits requests to.
|
||||||
|
Cannot be updated.
|
||||||
|
In CamelCase.
|
||||||
|
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||||
|
type: string
|
||||||
|
metadata:
|
||||||
|
type: object
|
||||||
|
spec:
|
||||||
|
description: Spec defines the rules to be evaluated.
|
||||||
|
properties:
|
||||||
|
rules:
|
||||||
|
description: Rules is a list of node customization rules.
|
||||||
|
items:
|
||||||
|
description: Rule defines a rule for node customization such as
|
||||||
|
labeling.
|
||||||
|
properties:
|
||||||
|
annotations:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: Annotations to create if the rule matches.
|
||||||
|
type: object
|
||||||
|
extendedResources:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: ExtendedResources to create if the rule matches.
|
||||||
|
type: object
|
||||||
|
labels:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: Labels to create if the rule matches.
|
||||||
|
type: object
|
||||||
|
labelsTemplate:
|
||||||
|
description: |-
|
||||||
|
LabelsTemplate specifies a template to expand for dynamically generating
|
||||||
|
multiple labels. Data (after template expansion) must be keys with an
|
||||||
|
optional value (<key>[=<value>]) separated by newlines.
|
||||||
|
type: string
|
||||||
|
matchAny:
|
||||||
|
description: MatchAny specifies a list of matchers one of which
|
||||||
|
must match.
|
||||||
|
items:
|
||||||
|
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||||
|
properties:
|
||||||
|
matchFeatures:
|
||||||
|
description: MatchFeatures specifies a set of matcher
|
||||||
|
terms all of which must match.
|
||||||
|
items:
|
||||||
|
description: |-
|
||||||
|
FeatureMatcherTerm defines requirements against one feature set. All
|
||||||
|
requirements (specified as MatchExpressions) are evaluated against each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
feature:
|
||||||
|
description: Feature is the name of the feature
|
||||||
|
set to match against.
|
||||||
|
type: string
|
||||||
|
matchExpressions:
|
||||||
|
additionalProperties:
|
||||||
|
description: |-
|
||||||
|
MatchExpression specifies an expression to evaluate against a set of input
|
||||||
|
values. It contains an operator that is applied when matching the input and
|
||||||
|
an array of values that the operator evaluates the input against.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
description: |-
|
||||||
|
MatchExpressions is the set of per-element expressions evaluated. These
|
||||||
|
match against the value of the specified elements.
|
||||||
|
type: object
|
||||||
|
matchName:
|
||||||
|
description: |-
|
||||||
|
MatchName in an expression that is matched against the name of each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- feature
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- matchFeatures
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
matchFeatures:
|
||||||
|
description: MatchFeatures specifies a set of matcher terms
|
||||||
|
all of which must match.
|
||||||
|
items:
|
||||||
|
description: |-
|
||||||
|
FeatureMatcherTerm defines requirements against one feature set. All
|
||||||
|
requirements (specified as MatchExpressions) are evaluated against each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
feature:
|
||||||
|
description: Feature is the name of the feature set to
|
||||||
|
match against.
|
||||||
|
type: string
|
||||||
|
matchExpressions:
|
||||||
|
additionalProperties:
|
||||||
|
description: |-
|
||||||
|
MatchExpression specifies an expression to evaluate against a set of input
|
||||||
|
values. It contains an operator that is applied when matching the input and
|
||||||
|
an array of values that the operator evaluates the input against.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
description: |-
|
||||||
|
MatchExpressions is the set of per-element expressions evaluated. These
|
||||||
|
match against the value of the specified elements.
|
||||||
|
type: object
|
||||||
|
matchName:
|
||||||
|
description: |-
|
||||||
|
MatchName in an expression that is matched against the name of each
|
||||||
|
element in the feature set.
|
||||||
|
properties:
|
||||||
|
op:
|
||||||
|
description: Op is the operator to be applied.
|
||||||
|
enum:
|
||||||
|
- In
|
||||||
|
- NotIn
|
||||||
|
- InRegexp
|
||||||
|
- Exists
|
||||||
|
- DoesNotExist
|
||||||
|
- Gt
|
||||||
|
- Lt
|
||||||
|
- GtLt
|
||||||
|
- IsTrue
|
||||||
|
- IsFalse
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: |-
|
||||||
|
Value is the list of values that the operand evaluates the input
|
||||||
|
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||||
|
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||||
|
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||||
|
In other cases Value should contain at least one element.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- op
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- feature
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
name:
|
||||||
|
description: Name of the rule.
|
||||||
|
type: string
|
||||||
|
taints:
|
||||||
|
description: Taints to create if the rule matches.
|
||||||
|
items:
|
||||||
|
description: |-
|
||||||
|
The node this Taint is attached to has the "effect" on
|
||||||
|
any pod that does not tolerate the Taint.
|
||||||
|
properties:
|
||||||
|
effect:
|
||||||
|
description: |-
|
||||||
|
Required. The effect of the taint on pods
|
||||||
|
that do not tolerate the taint.
|
||||||
|
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
|
||||||
|
type: string
|
||||||
|
key:
|
||||||
|
description: Required. The taint key to be applied to
|
||||||
|
a node.
|
||||||
|
type: string
|
||||||
|
timeAdded:
|
||||||
|
description: |-
|
||||||
|
TimeAdded represents the time at which the taint was added.
|
||||||
|
It is only written for NoExecute taints.
|
||||||
|
format: date-time
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
description: The taint value corresponding to the taint
|
||||||
|
key.
|
||||||
|
type: string
|
||||||
|
required:
|
||||||
|
- effect
|
||||||
|
- key
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
vars:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: |-
|
||||||
|
Vars is the variables to store if the rule matches. Variables do not
|
||||||
|
directly inflict any changes in the node object. However, they can be
|
||||||
|
referenced from other rules enabling more complex rule hierarchies,
|
||||||
|
without exposing intermediary output values as labels.
|
||||||
|
type: object
|
||||||
|
varsTemplate:
|
||||||
|
description: |-
|
||||||
|
VarsTemplate specifies a template to expand for dynamically generating
|
||||||
|
multiple variables. Data (after template expansion) must be keys with an
|
||||||
|
optional value (<key>[=<value>]) separated by newlines.
|
||||||
|
type: string
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- rules
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- spec
|
||||||
|
type: object
|
||||||
|
served: true
|
||||||
|
storage: true
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: node-feature-discovery-worker
|
||||||
|
namespace: node-feature-discovery
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
name: node-feature-discovery-worker
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: node-feature-discovery-worker
|
||||||
|
spec:
|
||||||
|
serviceAccountName: node-feature-discovery
|
||||||
|
securityContext:
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
containers:
|
||||||
|
- name: worker
|
||||||
|
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsNonRoot: true
|
||||||
|
env:
|
||||||
|
- name: NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
requests:
|
||||||
|
cpu: 5m
|
||||||
|
memory: 64Mi
|
||||||
|
command:
|
||||||
|
- "nfd-worker"
|
||||||
|
args:
|
||||||
|
- "-metrics=8081"
|
||||||
|
- "-grpc-health=8082"
|
||||||
|
ports:
|
||||||
|
- containerPort: 8081
|
||||||
|
name: metrics
|
||||||
|
- containerPort: 8082
|
||||||
|
name: health
|
||||||
|
volumeMounts:
|
||||||
|
- name: host-boot
|
||||||
|
mountPath: "/host-boot"
|
||||||
|
readOnly: true
|
||||||
|
- name: host-os-release
|
||||||
|
mountPath: "/host-etc/os-release"
|
||||||
|
readOnly: true
|
||||||
|
- name: host-sys
|
||||||
|
mountPath: "/host-sys"
|
||||||
|
readOnly: true
|
||||||
|
- name: host-usr-lib
|
||||||
|
mountPath: "/host-usr/lib"
|
||||||
|
readOnly: true
|
||||||
|
- name: host-lib
|
||||||
|
mountPath: "/host-lib"
|
||||||
|
readOnly: true
|
||||||
|
- name: host-proc-swaps
|
||||||
|
mountPath: "/host-proc/swaps"
|
||||||
|
readOnly: true
|
||||||
|
volumes:
|
||||||
|
- name: host-boot
|
||||||
|
hostPath:
|
||||||
|
path: "/boot"
|
||||||
|
- name: host-os-release
|
||||||
|
hostPath:
|
||||||
|
path: "/etc/os-release"
|
||||||
|
- name: host-sys
|
||||||
|
hostPath:
|
||||||
|
path: "/sys"
|
||||||
|
- name: host-usr-lib
|
||||||
|
hostPath:
|
||||||
|
path: "/usr/lib"
|
||||||
|
- name: host-lib
|
||||||
|
hostPath:
|
||||||
|
path: "/lib"
|
||||||
|
- name: host-proc-swaps
|
||||||
|
hostPath:
|
||||||
|
path: "/proc/swaps"
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
namespace: node-feature-discovery
|
||||||
|
labels:
|
||||||
|
- pairs:
|
||||||
|
app.kubernetes.io/name: node-feature-discovery
|
||||||
|
managedBy: kustomize
|
||||||
|
partOf: wild-cloud
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- crds.yaml
|
||||||
|
- rbac.yaml
|
||||||
|
- daemonset.yaml
|
||||||
|
- master.yaml
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: node-feature-discovery-master
|
||||||
|
namespace: node-feature-discovery
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
name: node-feature-discovery-master
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: node-feature-discovery-master
|
||||||
|
spec:
|
||||||
|
serviceAccountName: node-feature-discovery
|
||||||
|
securityContext:
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
containers:
|
||||||
|
- name: master
|
||||||
|
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsNonRoot: true
|
||||||
|
env:
|
||||||
|
- name: NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
command:
|
||||||
|
- "nfd-master"
|
||||||
|
args:
|
||||||
|
- "-metrics=8081"
|
||||||
|
- "-grpc-health=8082"
|
||||||
|
ports:
|
||||||
|
- containerPort: 8081
|
||||||
|
name: metrics
|
||||||
|
- containerPort: 8082
|
||||||
|
name: health
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
memory: 128Mi
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
# Namespace for Node Feature Discovery.
# All three Pod Security Admission modes (enforce, audit, warn) are set to
# the "privileged" level for this namespace.
# NOTE(review): presumably required because the NFD worker mounts host paths;
# confirm against daemonset.yaml.
kind: Namespace
apiVersion: v1
metadata:
  name: node-feature-discovery
  labels:
    pod-security.kubernetes.io/enforce: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/warn: privileged
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
# RBAC for Node Feature Discovery: one service account bound cluster-wide to
# one ClusterRole.

# Service account referenced by the nfd-master Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-feature-discovery
  namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-feature-discovery
rules:
  # Read and modify Node objects and their status.
  - apiGroups: [""]
    resources: [nodes, nodes/status]
    verbs: [get, patch, update, list]
  # Read-only access to namespaces.
  - apiGroups: [""]
    resources: [namespaces]
    verbs: [get, list, watch]
  # Read/write access to the NFD custom resources.
  - apiGroups: [nfd.k8s-sigs.io]
    resources: [nodefeatures, nodefeaturerules, nodefeaturegroups]
    verbs: [get, list, watch, create, update, patch]
---
# Grants the ClusterRole above to the node-feature-discovery service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-feature-discovery
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-feature-discovery
subjects:
  - kind: ServiceAccount
    name: node-feature-discovery
    namespace: node-feature-discovery
|
||||||
98
setup/cluster-services/nvidia-device-plugin/README.md
Normal file
98
setup/cluster-services/nvidia-device-plugin/README.md
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# NVIDIA Device Plugin
|
||||||
|
|
||||||
|
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
|
||||||
|
- Discovers NVIDIA GPUs on worker nodes
|
||||||
|
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
|
||||||
|
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
|
||||||
|
- Enables pods to request GPU resources
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Before installing the NVIDIA Device Plugin, ensure that:
|
||||||
|
|
||||||
|
1. **NVIDIA Drivers** are installed (>= 384.81)
|
||||||
|
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
|
||||||
|
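3. **nvidia-container-runtime** is registered with the container runtime under the handler name `nvidia` (the `nvidia` RuntimeClass created by this service points at that handler), or is configured as the default container runtime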
|
||||||
|
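4. Worker nodes have NVIDIA GPUs and carry the Node Feature Discovery label `feature.node.kubernetes.io/pci-0300_10de.present=true` (install the `node-feature-discovery` service first; the plugin DaemonSet's node affinity requires this label)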
|
||||||
|
|
||||||
|
### Talos Linux Requirements
|
||||||
|
|
||||||
|
For Talos Linux nodes, you need:
|
||||||
|
- NVIDIA drivers extension in the Talos schematic
|
||||||
|
- nvidia-container-toolkit extension
|
||||||
|
- Proper container runtime configuration
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Configure and install the service
|
||||||
|
wild-cluster-services-configure nvidia-device-plugin
|
||||||
|
wild-cluster-install nvidia-device-plugin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
After installation, verify the plugin is working:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check plugin pods are running
|
||||||
|
kubectl get pods -n kube-system | grep nvidia
|
||||||
|
|
||||||
|
# Verify GPU resources are advertised
|
||||||
|
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
|
||||||
|
|
||||||
|
# Check GPU node labels
|
||||||
|
kubectl get nodes --show-labels | grep nvidia
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage in Applications
|
||||||
|
|
||||||
|
Once installed, applications can request GPU resources:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: gpu-app
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: app
|
||||||
|
image: nvidia/cuda:12.4.1-base-ubuntu22.04
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
nvidia.com/gpu: 1
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Plugin Not Starting
|
||||||
|
- Verify NVIDIA drivers are installed on worker nodes
|
||||||
|
- Check that nvidia-container-toolkit is properly configured
|
||||||
|
- Ensure worker nodes are not tainted in a way that prevents scheduling
|
||||||
|
|
||||||
|
### No GPU Resources Advertised
|
||||||
|
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
|
||||||
|
- Verify NVIDIA runtime is the default container runtime
|
||||||
|
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
The plugin uses the following configuration:
|
||||||
|
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
|
||||||
|
- **Namespace**: `kube-system`
|
||||||
|
- **Priority Class**: `system-node-critical`
|
||||||
|
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
|
||||||
|
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
|
||||||
|
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)
|
||||||
51
setup/cluster-services/nvidia-device-plugin/install.sh
Executable file
51
setup/cluster-services/nvidia-device-plugin/install.sh
Executable file
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
#
# Install the NVIDIA Device Plugin from its pre-compiled kustomize templates.
#
# Required environment:
#   WC_ROOT  Wild Cloud root; ${WC_ROOT}/scripts/common.sh is sourced from it.
# WC_HOME is read after init_wild_env has run (presumably set by it -- confirm).
#
# Exits non-zero when WC_ROOT is unset, when the cluster has no worker nodes,
# when the compiled templates are missing, or when any kubectl step fails.
#
# NOTE(review): print_header/print_info/print_error/print_success are expected
# to be provided by common.sh. The sibling node-feature-discovery installer
# calls info/error/success instead -- confirm which names common.sh defines.
set -e
set -o pipefail

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
  # common.sh is not loaded yet, so report with plain echo.
  # ("print" is a ksh/zsh builtin, not a bash command.)
  echo "WC_ROOT is not set." >&2
  exit 1
else
  source "${WC_ROOT}/scripts/common.sh"
  init_wild_env
fi

CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

print_header "Setting up NVIDIA Device Plugin"

# Pre-flight check. This only counts nodes that lack the control-plane role
# label; it does not probe for GPUs itself.
print_info "Checking for NVIDIA GPUs in the cluster..."

WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
  print_error "No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
  exit 1
fi

print_info "Found $WORKER_NODES worker node(s)"

# Templates must already be compiled into ${NVIDIA_PLUGIN_DIR}/kustomize.
echo "Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
  # Same command the README and the node-feature-discovery installer point to.
  echo "Error: Compiled templates not found. Run 'wild-cluster-services-configure nvidia-device-plugin' first." >&2
  exit 1
fi

print_info "Deploying NVIDIA Device Plugin..."
# Quoted so a WC_HOME containing spaces does not split the path.
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

print_info "Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

print_success "NVIDIA Device Plugin installed successfully"
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
#
# Runs the device plugin, under the "nvidia" RuntimeClass, on nodes that carry
# a Node Feature Discovery label for an NVIDIA (PCI vendor 10de) display device.

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            # nodeSelectorTerms are ORed: a node qualifies if it matches either
            # PCI class. 0300 is "VGA compatible controller"; 0302 is
            # "3D controller", which some NVIDIA GPUs enumerate as instead.
            nodeSelectorTerms:
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0302_10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        # Host directory shared with the kubelet's device-plugin mount above.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
# Kustomization for the NVIDIA Device Plugin cluster service.
# Targets the kube-system namespace and stamps the listed resources with the
# shared labels below.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

namespace: kube-system

# Labels added to every generated object.
labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud

# Manifests that make up the service.
resources:
  - daemonset.yaml
  - runtimeclass.yaml
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
# RuntimeClass "nvidia": pods that set `runtimeClassName: nvidia` (as the
# device plugin DaemonSet does) are run with the container runtime handler
# named "nvidia".
kind: RuntimeClass
apiVersion: node.k8s.io/v1
metadata:
  name: nvidia
handler: nvidia
|
||||||
Reference in New Issue
Block a user