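# Deployment for the vLLM inference server. All {{ ... }} expressions are
# rendered from the .apps.vllm values at deploy time.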
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
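      # Pod-level hardening: run everything as a fixed non-root UID/GID under
      # the container runtime's default seccomp profile.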
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
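      # Pin pods to nodes exposing the requested GPU model; the
      # nvidia.com/gpu.product label is typically published by NVIDIA GPU
      # feature discovery (GPU Operator).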
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
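          # Container-level hardening: forbid privilege escalation and drop
          # all Linux capabilities.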
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
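            # Kept writable: no cache volume is mounted in this manifest, and
            # vLLM writes downloaded weights and compilation caches to disk.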
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
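            # Optional: eager mode skips CUDA graph capture, trading a little
            # latency for lower GPU memory overhead.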
            {{- if .apps.vllm.enforceEager }}
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
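          # nvidia.com/gpu is an extended resource and cannot be overcommitted,
          # so the request and the limit must name the same GPU count.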
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
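          # Ready once the OpenAI-compatible API answers; the initial delay
          # leaves room for model download and weight loading.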
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
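          # /health is the server's lightweight liveness endpoint; repeated
          # failures here restart the container.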
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
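      # Allow scheduling onto GPU nodes that carry the usual nvidia.com/gpu
      # NoSchedule taint.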
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"