From ad1cec9a591f5fca6205318fd303a445afb400c3 Mon Sep 17 00:00:00 2001 From: Paul Payne Date: Wed, 24 Sep 2025 04:35:28 -0700 Subject: [PATCH] Add nvidia-plugin and node-feature-discovery to cluster setup. --- .../node-feature-discovery/README.md | 0 .../node-feature-discovery/install.sh | 40 + .../kustomize.template/crds.yaml | 711 ++++++++++++++++++ .../kustomize.template/daemonset.yaml | 86 +++ .../kustomize.template/kustomization.yaml | 14 + .../kustomize.template/master.yaml | 49 ++ .../kustomize.template/namespace.yaml | 8 + .../kustomize.template/rbac.yaml | 55 ++ .../nvidia-device-plugin/README.md | 98 +++ .../nvidia-device-plugin/install.sh | 51 ++ .../kustomize.template/daemonset.yaml | 91 +++ .../kustomize.template/kustomization.yaml | 12 + .../kustomize.template/runtimeclass.yaml | 5 + 13 files changed, 1220 insertions(+) create mode 100644 setup/cluster-services/node-feature-discovery/README.md create mode 100755 setup/cluster-services/node-feature-discovery/install.sh create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/crds.yaml create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/daemonset.yaml create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/kustomization.yaml create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/master.yaml create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/namespace.yaml create mode 100644 setup/cluster-services/node-feature-discovery/kustomize.template/rbac.yaml create mode 100644 setup/cluster-services/nvidia-device-plugin/README.md create mode 100755 setup/cluster-services/nvidia-device-plugin/install.sh create mode 100644 setup/cluster-services/nvidia-device-plugin/kustomize.template/daemonset.yaml create mode 100644 setup/cluster-services/nvidia-device-plugin/kustomize.template/kustomization.yaml create mode 100644 
setup/cluster-services/nvidia-device-plugin/kustomize.template/runtimeclass.yaml diff --git a/setup/cluster-services/node-feature-discovery/README.md b/setup/cluster-services/node-feature-discovery/README.md new file mode 100644 index 0000000..e69de29 diff --git a/setup/cluster-services/node-feature-discovery/install.sh b/setup/cluster-services/node-feature-discovery/install.sh new file mode 100755 index 0000000..bc57839 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/install.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e +set -o pipefail + +# Initialize Wild Cloud environment (common.sh presumably provides init_wild_env and the logging helpers used below) +if [ -z "${WC_ROOT}" ]; then + echo "WC_ROOT is not set." >&2 # plain echo: common.sh is not sourced on this path and bash has no `print` builtin + exit 1 +else + source "${WC_ROOT}/scripts/common.sh" + init_wild_env +fi + +CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services" +NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery" + +print_header "Setting up Node Feature Discovery" + +# Templates should already be compiled by wild-cluster-services-configure; fail early if they are missing +info "Using pre-compiled Node Feature Discovery templates..." +if [ ! -d "${NFD_DIR}/kustomize" ]; then + error "Compiled templates not found. Run 'wild-cluster-services-configure node-feature-discovery' first." + exit 1 +fi + +info "Deploying Node Feature Discovery..." +kubectl apply -k "${NFD_DIR}/kustomize" + +info "Waiting for Node Feature Discovery worker and master to be ready..."
+kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s +kubectl rollout status deployment/node-feature-discovery-master -n node-feature-discovery --timeout=300s + +success "Node Feature Discovery installed successfully" +echo "" +echo "To verify the installation:" +echo " kubectl get pods -n node-feature-discovery" +echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io" +echo "" +echo "GPU nodes should now be labeled with GPU device information:" +echo " kubectl get nodes --show-labels | grep -E 'pci-[0-9a-f]{4}_10de'" \ No newline at end of file diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/crds.yaml b/setup/cluster-services/node-feature-discovery/kustomize.template/crds.yaml new file mode 100644 index 0000000..9f62da6 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/crds.yaml @@ -0,0 +1,711 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.3 + name: nodefeatures.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeature + listKind: NodeFeatureList + plural: nodefeatures + singular: nodefeature + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeature resource holds the features discovered for one node in the + cluster. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase.
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of the NodeFeature, containing features discovered + for a node. + properties: + features: + description: Features is the full "raw" features data that has been + discovered. + properties: + attributes: + additionalProperties: + description: AttributeFeatureSet is a set of features having + string value. + properties: + elements: + additionalProperties: + type: string + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Attributes contains all the attribute-type features + of the node. + type: object + flags: + additionalProperties: + description: FlagFeatureSet is a set of simple features only + containing names without values. + properties: + elements: + additionalProperties: + description: |- + Nil is a dummy empty struct for protobuf compatibility. + NOTE: protobuf definitions have been removed but this is kept for API compatibility. + type: object + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Flags contains all the flag-type features of the + node. + type: object + instances: + additionalProperties: + description: InstanceFeatureSet is a set of features each of + which is an instance having multiple attributes. + properties: + elements: + description: Individual features of the feature set. + items: + description: InstanceFeature represents one instance of + a complex features, e.g. a device. + properties: + attributes: + additionalProperties: + type: string + description: Attributes of the instance feature. + type: object + required: + - attributes + type: object + type: array + required: + - elements + type: object + description: Instances contains all the instance-type features + of the node. 
+ type: object + type: object + labels: + additionalProperties: + type: string + description: Labels is the set of node labels that are requested to + be created. + type: object + type: object + required: + - spec + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.3 + name: nodefeaturegroups.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureGroup + listKind: NodeFeatureGroupList + plural: nodefeaturegroups + shortNames: + - nfg + singular: nodefeaturegroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeFeatureGroup resource holds Node pools by featureGroup + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + featureGroupRules: + description: List of rules to evaluate to determine nodes that belong + in this group. + items: + description: GroupRule defines a rule for nodegroup filtering. + properties: + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. 
+ properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. 
Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. 
+ type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + required: + - name + type: object + type: array + required: + - featureGroupRules + type: object + status: + description: |- + Status of the NodeFeatureGroup after the most recent evaluation of the + specification. + properties: + nodes: + description: Nodes is a list of FeatureGroupNode in the cluster that + match the featureGroupRules + items: + properties: + name: + description: Name of the node. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.3 + name: nodefeaturerules.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureRule + listKind: NodeFeatureRuleList + plural: nodefeaturerules + shortNames: + - nfr + singular: nodefeaturerule + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeatureRule resource specifies a configuration for feature-based + customization of node objects, such as node labeling. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + rules: + description: Rules is a list of node customization rules. + items: + description: Rule defines a rule for node customization such as + labeling. + properties: + annotations: + additionalProperties: + type: string + description: Annotations to create if the rule matches. 
+ type: object + extendedResources: + additionalProperties: + type: string + description: ExtendedResources to create if the rule matches. + type: object + labels: + additionalProperties: + type: string + description: Labels to create if the rule matches. + type: object + labelsTemplate: + description: |- + LabelsTemplate specifies a template to expand for dynamically generating + multiple labels. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. 
+ items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. 
+ enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + taints: + description: Taints to create if the rule matches. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. 
+ type: string + key: + description: Required. The taint key to be applied to + a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + type: array + vars: + additionalProperties: + type: string + description: |- + Vars is the variables to store if the rule matches. Variables do not + directly inflict any changes in the node object. However, they can be + referenced from other rules enabling more complex rule hierarchies, + without exposing intermediary output values as labels. + type: object + varsTemplate: + description: |- + VarsTemplate specifies a template to expand for dynamically generating + multiple variables. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + required: + - name + type: object + type: array + required: + - rules + type: object + required: + - spec + type: object + served: true + storage: true diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/daemonset.yaml b/setup/cluster-services/node-feature-discovery/kustomize.template/daemonset.yaml new file mode 100644 index 0000000..3810914 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/daemonset.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-feature-discovery-worker + namespace: node-feature-discovery +spec: + selector: + matchLabels: + name: node-feature-discovery-worker + template: + metadata: + labels: + name: node-feature-discovery-worker + spec: + serviceAccountName: node-feature-discovery + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: worker + image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3 + securityContext: + 
allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + runAsNonRoot: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: 512Mi + requests: + cpu: 5m + memory: 64Mi + command: + - "nfd-worker" + args: + - "-metrics=8081" + - "-grpc-health=8082" + ports: + - containerPort: 8081 + name: metrics + - containerPort: 8082 + name: health + volumeMounts: + - name: host-boot + mountPath: "/host-boot" + readOnly: true + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + - name: host-sys + mountPath: "/host-sys" + readOnly: true + - name: host-usr-lib + mountPath: "/host-usr/lib" + readOnly: true + - name: host-lib + mountPath: "/host-lib" + readOnly: true + - name: host-proc-swaps + mountPath: "/host-proc/swaps" + readOnly: true + volumes: + - name: host-boot + hostPath: + path: "/boot" + - name: host-os-release + hostPath: + path: "/etc/os-release" + - name: host-sys + hostPath: + path: "/sys" + - name: host-usr-lib + hostPath: + path: "/usr/lib" + - name: host-lib + hostPath: + path: "/lib" + - name: host-proc-swaps + hostPath: + path: "/proc/swaps" \ No newline at end of file diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/kustomization.yaml b/setup/cluster-services/node-feature-discovery/kustomize.template/kustomization.yaml new file mode 100644 index 0000000..7d32cf0 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: node-feature-discovery +labels: + - pairs: + app.kubernetes.io/name: node-feature-discovery + managedBy: kustomize + partOf: wild-cloud +resources: + - namespace.yaml + - crds.yaml + - rbac.yaml + - daemonset.yaml + - master.yaml \ No newline at end of file diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/master.yaml 
b/setup/cluster-services/node-feature-discovery/kustomize.template/master.yaml new file mode 100644 index 0000000..c2fff33 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/master.yaml @@ -0,0 +1,49 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: node-feature-discovery-master + namespace: node-feature-discovery +spec: + replicas: 1 + selector: + matchLabels: + name: node-feature-discovery-master + template: + metadata: + labels: + name: node-feature-discovery-master + spec: + serviceAccountName: node-feature-discovery + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: master + image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + runAsNonRoot: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - "nfd-master" + args: + - "-metrics=8081" + - "-grpc-health=8082" + ports: + - containerPort: 8081 + name: metrics + - containerPort: 8082 + name: health + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 128Mi \ No newline at end of file diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/namespace.yaml b/setup/cluster-services/node-feature-discovery/kustomize.template/namespace.yaml new file mode 100644 index 0000000..75867e7 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/namespace.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: node-feature-discovery + labels: + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged \ No newline at end of file diff --git a/setup/cluster-services/node-feature-discovery/kustomize.template/rbac.yaml b/setup/cluster-services/node-feature-discovery/kustomize.template/rbac.yaml new file mode 100644 index 
0000000..12359d6 --- /dev/null +++ b/setup/cluster-services/node-feature-discovery/kustomize.template/rbac.yaml @@ -0,0 +1,55 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-feature-discovery + namespace: node-feature-discovery +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-feature-discovery +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - get + - patch + - update + - list +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + - nodefeaturerules + - nodefeaturegroups + verbs: + - get + - list + - watch + - create + - update + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-feature-discovery +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-feature-discovery +subjects: +- kind: ServiceAccount + name: node-feature-discovery + namespace: node-feature-discovery \ No newline at end of file diff --git a/setup/cluster-services/nvidia-device-plugin/README.md b/setup/cluster-services/nvidia-device-plugin/README.md new file mode 100644 index 0000000..fe05b13 --- /dev/null +++ b/setup/cluster-services/nvidia-device-plugin/README.md @@ -0,0 +1,98 @@ +# NVIDIA Device Plugin + +The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs. + +## Overview + +This service deploys the official NVIDIA Device Plugin as a DaemonSet that: +- Discovers NVIDIA GPUs on worker nodes +- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`) +- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler +- Enables pods to request GPU resources + +## Prerequisites + +Before installing the NVIDIA Device Plugin, ensure that: + +1. **NVIDIA Drivers** are installed (>= 384.81) +2. 
**nvidia-container-toolkit** is installed (>= 1.7.0) +3. **nvidia-container-runtime** is configured as the default container runtime +4. Worker nodes have NVIDIA GPUs + +### Talos Linux Requirements + +For Talos Linux nodes, you need: +- NVIDIA drivers extension in the Talos schematic +- nvidia-container-toolkit extension +- Proper container runtime configuration + +## Installation + +```bash +# Configure and install the service +wild-cluster-services-configure nvidia-device-plugin +wild-cluster-install nvidia-device-plugin +``` + +## Verification + +After installation, verify the plugin is working: + +```bash +# Check plugin pods are running +kubectl get pods -n kube-system | grep nvidia + +# Verify GPU resources are advertised +kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))' + +# Check GPU node labels +kubectl get nodes --show-labels | grep nvidia +``` + +## Usage in Applications + +Once installed, applications can request GPU resources (abridged Deployment fragment; add `selector` and matching pod labels before applying): + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-app +spec: + template: + spec: + containers: + - name: app + image: nvidia/cuda:12.4.1-base-ubuntu22.04 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 +``` + +## Troubleshooting + +### Plugin Not Starting +- Verify NVIDIA drivers are installed on worker nodes +- Check that nvidia-container-toolkit is properly configured +- Ensure worker nodes are not tainted in a way that prevents scheduling + +### No GPU Resources Advertised +- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds` +- Verify NVIDIA runtime is the default container runtime +- Ensure GPUs are detected by the driver: check node logs for GPU detection messages + +## Configuration + +The plugin uses the following configuration: +- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1` +- **Namespace**: `kube-system` +- **Priority Class**: `system-node-critical` +- **Tolerations**: Schedules on nodes with
`nvidia.com/gpu` taint + +## References + +- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin) +- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) +- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/) diff --git a/setup/cluster-services/nvidia-device-plugin/install.sh b/setup/cluster-services/nvidia-device-plugin/install.sh new file mode 100755 index 0000000..7cdc16e --- /dev/null +++ b/setup/cluster-services/nvidia-device-plugin/install.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +set -o pipefail + +# Initialize Wild Cloud environment (WC_ROOT locates scripts/common.sh, which is sourced below) +if [ -z "${WC_ROOT}" ]; then + echo "WC_ROOT is not set." >&2  # plain echo: common.sh helpers are not loaded yet + exit 1 +else + source "${WC_ROOT}/scripts/common.sh" + init_wild_env +fi + +CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services" +NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin" + +print_header "Setting up NVIDIA Device Plugin" + +# Pre-flight check (only counts worker nodes; GPU presence itself is not probed here) +print_info "Checking for NVIDIA GPUs in the cluster..." + +# Check if any worker nodes exist (device plugin only runs on worker nodes) +WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l) +if [ "$WORKER_NODES" -eq 0 ]; then + print_error "No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes." + exit 1 +fi + +print_info "Found $WORKER_NODES worker node(s)" + +# Templates should already be compiled by wild-cluster-services-configure +print_info "Using pre-compiled NVIDIA Device Plugin templates..." +if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then + print_error "Compiled templates not found. Run 'wild-cluster-services-configure nvidia-device-plugin' first." + exit 1 +fi + +print_info "Deploying NVIDIA Device Plugin..." +kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize" + +print_info "Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
+kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s + +print_success "NVIDIA Device Plugin installed successfully" +echo "" +echo "To verify the installation:" +echo " kubectl get pods -n kube-system | grep nvidia" +echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'" +echo "" +echo "The DaemonSet only schedules onto nodes that carry the Node Feature Discovery GPU label:" +echo " kubectl get nodes -l feature.node.kubernetes.io/pci-0300_10de.present=true" diff --git a/setup/cluster-services/nvidia-device-plugin/kustomize.template/daemonset.yaml b/setup/cluster-services/nvidia-device-plugin/kustomize.template/daemonset.yaml new file mode 100644 index 0000000..e4b2ec5 --- /dev/null +++ b/setup/cluster-services/nvidia-device-plugin/kustomize.template/daemonset.yaml @@ -0,0 +1,91 @@ +# NVIDIA Device Plugin DaemonSet +# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml +# Licensed under the Apache License, Version 2.0 + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system + labels: + app.kubernetes.io/name: nvidia-device-plugin + app.kubernetes.io/component: device-plugin + managedBy: kustomize + partOf: wild-cloud +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds  # must match the pod template label below + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + app.kubernetes.io/name: nvidia-device-plugin + app.kubernetes.io/component: device-plugin + spec: + runtimeClassName: nvidia  # run the plugin under the `nvidia` RuntimeClass (runtimeclass.yaml) + tolerations: + - key: nvidia.com/gpu  # allow scheduling onto nodes tainted nvidia.com/gpu:NoSchedule + operator: Exists + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present  # NFD-generated PCI label (class 0300, vendor 10de is NVIDIA's PCI vendor ID) + operator: In + values: + - "true" + # Mark this
pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1 + name: nvidia-device-plugin-ctr + env: + - name: MPS_ROOT + value: /run/nvidia/mps  # same host path as the mps-root volume below + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + - name: FAIL_ON_INIT_ERROR + value: "false"  # quoted: env values must be strings, not YAML booleans + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: mps-shm + mountPath: /dev/shm + - name: mps-root + mountPath: /mps + - name: cdi-root + mountPath: /var/run/cdi + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins  # kubelet's device-plugin directory on the host + - name: mps-root + hostPath: + path: /run/nvidia/mps + type: DirectoryOrCreate + - name: mps-shm + hostPath: + path: /run/nvidia/mps/shm  # no hostPath type set, unlike mps-root and cdi-root + - name: cdi-root + hostPath: + path: /var/run/cdi + type: DirectoryOrCreate diff --git a/setup/cluster-services/nvidia-device-plugin/kustomize.template/kustomization.yaml b/setup/cluster-services/nvidia-device-plugin/kustomize.template/kustomization.yaml new file mode 100644 index 0000000..c402fd6 --- /dev/null +++ b/setup/cluster-services/nvidia-device-plugin/kustomize.template/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kube-system  # namespace kustomize sets on the namespaced resources below +resources:  # manifests bundled by this kustomization + - daemonset.yaml + - runtimeclass.yaml +labels:  # label pairs kustomize adds to the resources above + - pairs: + app.kubernetes.io/name: nvidia-device-plugin + app.kubernetes.io/component: device-plugin + managedBy: kustomize + partOf: wild-cloud diff --git
a/setup/cluster-services/nvidia-device-plugin/kustomize.template/runtimeclass.yaml b/setup/cluster-services/nvidia-device-plugin/kustomize.template/runtimeclass.yaml new file mode 100644 index 0000000..05d51eb --- /dev/null +++ b/setup/cluster-services/nvidia-device-plugin/kustomize.template/runtimeclass.yaml @@ -0,0 +1,5 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass  # cluster-scoped; selected by pods through spec.runtimeClassName +metadata: + name: nvidia  # referenced by runtimeClassName in daemonset.yaml +handler: nvidia  # container runtime handler name; must exist in the node's CRI configuration