apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      nodeSelector:
        # Schedule only onto nodes exposing the requested GPU model (GPU feature discovery label).
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # vLLM writes to its model/compile caches, so the root filesystem stays writable.
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            # --enforce-eager is a boolean flag; pass it bare rather than with =True.
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          readinessProbe:
            # /v1/models responds once the model is loaded and the OpenAI-compatible API is serving.
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      tolerations:
        # Tolerate the standard NVIDIA GPU taint so the pod can land on GPU nodes.
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
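# Template keys consumed above, with illustrative values only. This is a sketch of the
# expected config shape; the concrete image tag, model name, and sizing numbers below are
# assumptions, not part of this manifest:
#
#   apps:
#     vllm:
#       image: vllm/vllm-openai:v0.8.5              # assumed image tag
#       model: meta-llama/Llama-3.1-8B-Instruct     # assumed model
#       maxModelLen: 8192
#       tensorParallelSize: 1
#       gpuMemoryUtilization: 0.90
#       enforceEager: false
#       gpuProduct: NVIDIA-A100-SXM4-80GB
#       gpuCount: 1
#       cpuRequest: "4"
#       memoryRequest: 32Gi
#       cpuLimit: "8"
#       memoryLimit: 64Gi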