apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      nodeSelector:
        # Schedule only onto nodes exposing the requested GPU model (GPU feature discovery label).
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # vLLM writes to its model/compile caches, so the root filesystem stays writable.
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            # --enforce-eager is a boolean flag; pass it bare rather than with =True.
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          readinessProbe:
            # /v1/models responds once the model is loaded and the OpenAI-compatible API is serving.
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      tolerations:
        # Tolerate the standard NVIDIA GPU taint so the pod can land on GPU nodes.
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
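# Template keys consumed above, with illustrative values only. This is a sketch of the
# expected config shape; the concrete image tag, model name, and sizing numbers below are
# assumptions, not part of this manifest:
#
#   apps:
#     vllm:
#       image: vllm/vllm-openai:v0.8.5              # assumed image tag
#       model: meta-llama/Llama-3.1-8B-Instruct     # assumed model
#       maxModelLen: 8192
#       tensorParallelSize: 1
#       gpuMemoryUtilization: 0.90
#       enforceEager: false
#       gpuProduct: NVIDIA-A100-SXM4-80GB
#       gpuCount: 1
#       cpuRequest: "4"
#       memoryRequest: 32Gi
#       cpuLimit: "8"
#       memoryLimit: 64Gi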