Add nvidia-plugin and node-feature-discovery to cluster setup.

This commit is contained in:
2025-09-24 04:35:28 -07:00
parent 0dc8696820
commit ad1cec9a59
13 changed files with 1220 additions and 0 deletions

View File

@@ -0,0 +1,40 @@
#!/bin/bash
set -e
set -o pipefail
# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
print "WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi
CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"
print_header "Setting up Node Feature Discovery"
# Templates should already be compiled by wild-cluster-services-generate
info "Using pre-compiled Node Feature Discovery templates..."
if [ ! -d "${NFD_DIR}/kustomize" ]; then
error "Compiled templates not found. Run 'wild-cluster-services-configure node-feature-discovery' first."
exit 1
fi
info "Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"
info "Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s
success "Node Feature Discovery installed successfully"
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n node-feature-discovery"
echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
echo ""
echo "GPU nodes should now be labeled with GPU device information:"
echo " kubectl get nodes --show-labels | grep pci-10de"

View File

@@ -0,0 +1,711 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeatures.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeature
listKind: NodeFeatureList
plural: nodefeatures
singular: nodefeature
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeature resource holds the features discovered for one node in the
cluster.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of the NodeFeature, containing features discovered
for a node.
properties:
features:
description: Features is the full "raw" features data that has been
discovered.
properties:
attributes:
additionalProperties:
description: AttributeFeatureSet is a set of features having
string value.
properties:
elements:
additionalProperties:
type: string
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Attributes contains all the attribute-type features
of the node.
type: object
flags:
additionalProperties:
description: FlagFeatureSet is a set of simple features only
containing names without values.
properties:
elements:
additionalProperties:
description: |-
Nil is a dummy empty struct for protobuf compatibility.
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
type: object
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Flags contains all the flag-type features of the
node.
type: object
instances:
additionalProperties:
description: InstanceFeatureSet is a set of features each of
which is an instance having multiple attributes.
properties:
elements:
description: Individual features of the feature set.
items:
description: InstanceFeature represents one instance of
a complex features, e.g. a device.
properties:
attributes:
additionalProperties:
type: string
description: Attributes of the instance feature.
type: object
required:
- attributes
type: object
type: array
required:
- elements
type: object
description: Instances contains all the instance-type features
of the node.
type: object
type: object
labels:
additionalProperties:
type: string
description: Labels is the set of node labels that are requested to
be created.
type: object
type: object
required:
- spec
type: object
served: true
storage: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturegroups.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureGroup
listKind: NodeFeatureGroupList
plural: nodefeaturegroups
shortNames:
- nfg
singular: nodefeaturegroup
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeFeatureGroup resource holds Node pools by featureGroup
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
featureGroupRules:
description: List of rules to evaluate to determine nodes that belong
in this group.
items:
description: GroupRule defines a rule for nodegroup filtering.
properties:
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
required:
- name
type: object
type: array
required:
- featureGroupRules
type: object
status:
description: |-
Status of the NodeFeatureGroup after the most recent evaluation of the
specification.
properties:
nodes:
description: Nodes is a list of FeatureGroupNode in the cluster that
match the featureGroupRules
items:
properties:
name:
description: Name of the node.
type: string
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturerules.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureRule
listKind: NodeFeatureRuleList
plural: nodefeaturerules
shortNames:
- nfr
singular: nodefeaturerule
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeatureRule resource specifies a configuration for feature-based
customization of node objects, such as node labeling.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
rules:
description: Rules is a list of node customization rules.
items:
description: Rule defines a rule for node customization such as
labeling.
properties:
annotations:
additionalProperties:
type: string
description: Annotations to create if the rule matches.
type: object
extendedResources:
additionalProperties:
type: string
description: ExtendedResources to create if the rule matches.
type: object
labels:
additionalProperties:
type: string
description: Labels to create if the rule matches.
type: object
labelsTemplate:
description: |-
LabelsTemplate specifies a template to expand for dynamically generating
multiple labels. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
taints:
description: Taints to create if the rule matches.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to
a node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
vars:
additionalProperties:
type: string
description: |-
Vars is the variables to store if the rule matches. Variables do not
directly inflict any changes in the node object. However, they can be
referenced from other rules enabling more complex rule hierarchies,
without exposing intermediary output values as labels.
type: object
varsTemplate:
description: |-
VarsTemplate specifies a template to expand for dynamically generating
multiple variables. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
required:
- name
type: object
type: array
required:
- rules
type: object
required:
- spec
type: object
served: true
storage: true

View File

@@ -0,0 +1,86 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-feature-discovery-worker
namespace: node-feature-discovery
spec:
selector:
matchLabels:
name: node-feature-discovery-worker
template:
metadata:
labels:
name: node-feature-discovery-worker
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: worker
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
resources:
limits:
memory: 512Mi
requests:
cpu: 5m
memory: 64Mi
command:
- "nfd-worker"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
readOnly: true
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
- name: host-sys
mountPath: "/host-sys"
readOnly: true
- name: host-usr-lib
mountPath: "/host-usr/lib"
readOnly: true
- name: host-lib
mountPath: "/host-lib"
readOnly: true
- name: host-proc-swaps
mountPath: "/host-proc/swaps"
readOnly: true
volumes:
- name: host-boot
hostPath:
path: "/boot"
- name: host-os-release
hostPath:
path: "/etc/os-release"
- name: host-sys
hostPath:
path: "/sys"
- name: host-usr-lib
hostPath:
path: "/usr/lib"
- name: host-lib
hostPath:
path: "/lib"
- name: host-proc-swaps
hostPath:
path: "/proc/swaps"

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-feature-discovery
labels:
- pairs:
app.kubernetes.io/name: node-feature-discovery
managedBy: kustomize
partOf: wild-cloud
resources:
- namespace.yaml
- crds.yaml
- rbac.yaml
- daemonset.yaml
- master.yaml

View File

@@ -0,0 +1,49 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: node-feature-discovery-master
namespace: node-feature-discovery
spec:
replicas: 1
selector:
matchLabels:
name: node-feature-discovery-master
template:
metadata:
labels:
name: node-feature-discovery-master
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: master
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "nfd-master"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
resources:
requests:
cpu: 10m
memory: 64Mi
limits:
memory: 128Mi

View File

@@ -0,0 +1,8 @@
apiVersion: v1
kind: Namespace
metadata:
name: node-feature-discovery
labels:
pod-security.kubernetes.io/enforce: privileged
pod-security.kubernetes.io/audit: privileged
pod-security.kubernetes.io/warn: privileged

View File

@@ -0,0 +1,55 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-feature-discovery
namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- list
- watch
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
- nodefeaturerules
- nodefeaturegroups
verbs:
- get
- list
- watch
- create
- update
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-feature-discovery
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: node-feature-discovery

View File

@@ -0,0 +1,98 @@
# NVIDIA Device Plugin
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
## Overview
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources
## Prerequisites
Before installing the NVIDIA Device Plugin, ensure that:
1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs
### Talos Linux Requirements
For Talos Linux nodes, you need:
- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
## Installation
```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```
## Verification
After installation, verify the plugin is working:
```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia
# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```
## Usage in Applications
Once installed, applications can request GPU resources:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-app
spec:
template:
spec:
containers:
- name: app
image: nvidia/cuda:latest
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1
```
## Troubleshooting
### Plugin Not Starting
- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling
### No GPU Resources Advertised
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages
## Configuration
The plugin uses the following configuration:
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint
## References
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)

View File

@@ -0,0 +1,51 @@
#!/bin/bash
set -e
set -o pipefail
# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
print "WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi
CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"
print_header "Setting up NVIDIA Device Plugin"
# Check if we have NVIDIA GPUs in the cluster
print_info "Checking for NVIDIA GPUs in the cluster..."
# Check if any worker nodes exist (device plugin only runs on worker nodes)
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
print_error "No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
exit 1
fi
print_info "Found $WORKER_NODES worker node(s)"
# Templates should already be compiled by wild-cluster-services-generate
echo "Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
echo "Error: Compiled templates not found. Run 'wild-cluster-services-generate' first."
exit 1
fi
print_info "Deploying NVIDIA Device Plugin..."
kubectl apply -k ${NVIDIA_PLUGIN_DIR}/kustomize
print_info "Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s
print_success "NVIDIA Device Plugin installed successfully"
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"

View File

@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
labels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
spec:
runtimeClassName: nvidia
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: CriticalAddonsOnly
operator: Exists
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: feature.node.kubernetes.io/pci-0300_10de.present
operator: In
values:
- "true"
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
name: nvidia-device-plugin-ctr
env:
- name: MPS_ROOT
value: /run/nvidia/mps
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: mps-shm
mountPath: /dev/shm
- name: mps-root
mountPath: /mps
- name: cdi-root
mountPath: /var/run/cdi
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
path: /run/nvidia/mps
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /run/nvidia/mps/shm
- name: cdi-root
hostPath:
path: /var/run/cdi
type: DirectoryOrCreate

View File

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
- daemonset.yaml
- runtimeclass.yaml
labels:
- pairs:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud

View File

@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia
handler: nvidia