Reorganized for new stable/waypoint versioning design.

2026-05-24 18:28:47 +00:00
parent 945d2225a2
commit bc7a168851
352 changed files with 1264 additions and 294 deletions
--- a/vllm/versions/0/deployment.yaml
+++ b/vllm/versions/0/deployment.yaml
@@ -0,0 +1,105 @@
+# vLLM OpenAI-compatible inference server, run as a single-replica Deployment
+# on GPU nodes. The double-brace placeholders look like Go-template fields and
+# are presumably rendered before the manifest is applied - confirm with the
+# tooling that consumes this directory.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      component: inference
+  template:
+    metadata:
+      labels:
+        component: inference
+    spec:
+      # Pod-level hardening: unprivileged UID/GID and the container runtime's
+      # default seccomp profile.
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      # Only schedule onto nodes labelled with the requested GPU model.
+      nodeSelector:
+        nvidia.com/gpu.product: "{{ .gpuProduct }}"
+      containers:
+        - name: vllm
+          image: vllm/vllm-openai:v0.5.4
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            # NOTE(review): root filesystem stays writable, presumably for
+            # model/cache files - confirm, or mount a volume and set true.
+            readOnlyRootFilesystem: false
+          args:
+            - --model={{ .model }}
+            - --max-model-len={{ .maxModelLen }}
+            # NOTE(review): fixed at 1 while the GPU count below is templated;
+            # confirm the GPU count is always 1 for this version.
+            - --tensor-parallel-size=1
+            - --gpu-memory-utilization={{ .gpuMemoryUtilization }}
+            # Boolean switch: pass it bare. argparse rejects an explicit
+            # value on a store_true flag, so "=True" aborts server startup.
+            - --enforce-eager
+          env:
+            # NOTE(review): neither variable was found among vLLM's documented
+            # environment variables - confirm the image actually reads them.
+            - name: VLLM_TORCH_DTYPE
+              value: "auto"
+            - name: VLLM_WORKER_CONCURRENCY
+              value: "1"
+          ports:
+            - name: http
+              containerPort: 8000
+          resources:
+            # Templated values are quoted so the unrendered file is still valid
+            # YAML. The GPU request and limit use the same value; Kubernetes
+            # requires them to match for extended resources.
+            requests:
+              cpu: "{{ .cpuRequest }}"
+              memory: "{{ .memoryRequest }}"
+              nvidia.com/gpu: "{{ .gpuCount }}"
+            limits:
+              cpu: "{{ .cpuLimit }}"
+              memory: "{{ .memoryLimit }}"
+              nvidia.com/gpu: "{{ .gpuCount }}"
+          # Holds off the readiness/liveness probes until the server first
+          # answers, allowing up to 10 minutes (60 x 10s) for model loading
+          # instead of letting the liveness probe restart a slow start.
+          # NOTE(review): budget is an estimate - tune to the model size.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 60
+          # Ready once the OpenAI-compatible model listing responds.
+          readinessProbe:
+            httpGet:
+              path: /v1/models
+              port: http
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+          # Restart the container if the health endpoint stops responding.
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 60
+            periodSeconds: 15
+            timeoutSeconds: 5
+      # Allow scheduling onto nodes tainted nvidia.com/gpu:NoSchedule.
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"