wild-directory/vllm/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            - --enforce-eager=True
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
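For reference, a minimal sketch of the values structure this template expects. The key names are taken from the {{ .apps.vllm.* }} references above; the concrete values shown are illustrative assumptions only, not taken from the repository, and need to be adjusted to the target cluster and model.

apps:
  vllm:
    # Example values only (assumed, not from the source repo).
    image: "vllm/vllm-openai:latest"        # assumed container image
    model: "Qwen/Qwen2.5-7B-Instruct"       # assumed model id
    maxModelLen: 8192
    tensorParallelSize: 1
    gpuMemoryUtilization: 0.90
    enforceEager: false
    gpuProduct: "NVIDIA-A100-SXM4-80GB"     # must match the node label nvidia.com/gpu.product
    gpuCount: 1
    cpuRequest: "4"
    memoryRequest: "32Gi"
    cpuLimit: "8"
    memoryLimit: "64Gi"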