Adds VLLM app.
This commit is contained in:
75
apps/vllm/deployment.yaml
Normal file
75
apps/vllm/deployment.yaml
Normal file
@@ -0,0 +1,75 @@
---
# Deployment for the vLLM OpenAI-compatible inference server.
# All "{{ ... }}" expressions are Go-template placeholders rendered
# before this file is applied; scalar substitutions are quoted so the
# file stays valid YAML even if a value expands empty or boolean-like.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  # Single replica: the pod pins GPU(s); scale-out needs per-replica GPUs.
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      # Run the whole pod as a non-root user with the default seccomp profile.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Pin scheduling to nodes exposing the configured GPU model
      # (label published by the NVIDIA GPU feature-discovery operator).
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # vLLM writes model/cache files under its root filesystem,
            # so it cannot run read-only.
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            # --enforce-eager is a store_true flag; passing "=True" is
            # rejected by vLLM's argument parser.
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              # GPU request and limit must be equal (extended resources
              # do not support overcommit).
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          # Ready once the OpenAI-compatible API can list models.
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          # Liveness uses the lightweight /health endpoint; generous
          # initial delay to allow model download/load.
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      # Allow scheduling onto GPU nodes tainted nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
25
apps/vllm/ingress.yaml
Normal file
25
apps/vllm/ingress.yaml
Normal file
@@ -0,0 +1,25 @@
---
# Ingress exposing the vLLM HTTP API via Traefik with a Let's Encrypt
# certificate, DNS managed by external-dns (Cloudflare proxy disabled).
# Templated scalars are quoted: an unquoted value starting with "{{" is
# read as a YAML flow mapping by pre-render tooling, and an empty
# expansion would otherwise become null.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    external-dns.alpha.kubernetes.io/target: "{{ .cloud.domain }}"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: "{{ .apps.vllm.domain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - "{{ .apps.vllm.domain }}"
      secretName: vllm-tls
14
apps/vllm/kustomization.yaml
Normal file
14
apps/vllm/kustomization.yaml
Normal file
@@ -0,0 +1,14 @@
---
# Kustomization assembling the vLLM app manifests into the configured
# namespace, with common labels applied to resources and selectors.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Quoted so an empty template expansion stays a string rather than null.
namespace: "{{ .apps.vllm.namespace }}"
labels:
  # includeSelectors propagates these labels into selector fields too.
  - includeSelectors: true
    pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
21
apps/vllm/manifest.yaml
Normal file
21
apps/vllm/manifest.yaml
Normal file
@@ -0,0 +1,21 @@
---
# App manifest: metadata plus the default configuration values consumed
# by the templated Kubernetes manifests in this directory.
name: vllm
description: vLLM is a fast and easy-to-use library for LLM inference and serving with OpenAI-compatible API
# Quoted so version-like values are never re-typed by YAML tooling.
version: "0.5.4"
icon: https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
# No other apps are required by this one.
requires: []
defaultConfig:
  image: vllm/vllm-openai:v0.5.4
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  tensorParallelSize: 1
  gpuMemoryUtilization: 0.90
  enforceEager: true
  # Must match the node's nvidia.com/gpu.product label value.
  gpuProduct: "RTX 4090"
  cpuRequest: "4"
  cpuLimit: "8"
  memoryRequest: "16Gi"
  memoryLimit: "24Gi"
  gpuCount: 1
  domain: vllm.{{ .cloud.domain }}
  namespace: llm
# No secrets are needed by this app.
requiredSecrets: []
4
apps/vllm/namespace.yaml
Normal file
4
apps/vllm/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
---
# Namespace that all vLLM resources are deployed into.
apiVersion: v1
kind: Namespace
metadata:
  # Quoted: an unquoted value beginning with "{{" parses as a YAML flow
  # mapping in pre-render tooling, and an empty expansion would be null.
  name: "{{ .apps.vllm.namespace }}"
12
apps/vllm/service.yaml
Normal file
12
apps/vllm/service.yaml
Normal file
@@ -0,0 +1,12 @@
---
# ClusterIP Service fronting the vLLM inference pods; the Ingress
# routes external traffic to this service on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  # Matches the pod-template label set by the Deployment.
  selector:
    component: inference
  ports:
    - name: http
      port: 8000
      # Resolves to the container port named "http" (8000).
      targetPort: http
Reference in New Issue
Block a user