Adds VLLM app.

2025-09-24 04:33:57 -07:00
parent 80b9d14ec4
commit 0dc8696820
6 changed files with 151 additions and 0 deletions
--- a/apps/vllm/deployment.yaml
+++ b/apps/vllm/deployment.yaml
@@ -0,0 +1,113 @@
+# Deployment for the vLLM inference server.
+#
+# This file is a Go-template source: the .apps.vllm.* placeholders are filled
+# in from the app configuration before the result is parsed as YAML.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      component: inference
+  template:
+    metadata:
+      labels:
+        component: inference
+    spec:
+      # Pod-wide hardening: fixed non-root UID/GID and the container runtime's
+      # default seccomp profile.
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      # Schedule only onto nodes labelled with the configured GPU product.
+      nodeSelector:
+        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
+      containers:
+        - name: vllm
+          image: "{{ .apps.vllm.image }}"
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            readOnlyRootFilesystem: false
+          # Arguments are quoted so the rendered values always reach the server
+          # as strings, whatever the substituted text looks like to YAML.
+          args:
+            - "--model={{ .apps.vllm.model }}"
+            - "--max-model-len={{ .apps.vllm.maxModelLen }}"
+            - "--tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}"
+            - "--gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}"
+            # --enforce-eager is a value-less boolean switch, so it is passed
+            # bare rather than with an explicit =True (which the CLI rejects).
+            {{- if .apps.vllm.enforceEager }}
+            - "--enforce-eager"
+            {{- end }}
+          # NOTE(review): confirm the configured image actually reads these two
+          # variables; they are passed through unchanged.
+          env:
+            - name: VLLM_TORCH_DTYPE
+              value: "auto"
+            - name: VLLM_WORKER_CONCURRENCY
+              value: "1"
+          ports:
+            - name: http
+              containerPort: 8000
+          resources:
+            requests:
+              cpu: "{{ .apps.vllm.cpuRequest }}"
+              memory: "{{ .apps.vllm.memoryRequest }}"
+              nvidia.com/gpu: "{{ .apps.vllm.gpuCount }}"
+            limits:
+              cpu: "{{ .apps.vllm.cpuLimit }}"
+              memory: "{{ .apps.vllm.memoryLimit }}"
+              nvidia.com/gpu: "{{ .apps.vllm.gpuCount }}"
+          # Shared memory for multi-GPU (tensor-parallel) worker communication.
+          volumeMounts:
+            - name: shm
+              mountPath: /dev/shm
+          # Model download and loading can outlast the liveness window, so a
+          # startup probe holds off the other probes for up to
+          # failureThreshold x periodSeconds = 180 x 10s = 30 minutes.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 180
+          # Traffic is routed only while GET /v1/models succeeds.
+          readinessProbe:
+            httpGet:
+              path: /v1/models
+              port: http
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+          # The container is restarted when GET /health keeps failing.
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 60
+            periodSeconds: 15
+            timeoutSeconds: 5
+      # Tolerate the GPU taint so the pod may land on tainted GPU nodes.
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      # Memory-backed /dev/shm; the container default is too small once
+      # tensor-parallel-size exceeds 1. Pages actually used count against the
+      # container memory limit, so the size is capped.
+      volumes:
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 2Gi