Adds vLLM app.
apps/vllm/deployment.yaml (new file)
@@ -0,0 +1,75 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
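For reference, with the defaults from manifest.yaml further down in this commit, the templated args block would render roughly as follows (a sketch only; the exact output depends on the wild-cloud template engine and any local config overrides):

          args:
            - --model=Qwen/Qwen2.5-7B-Instruct
            - --max-model-len=8192
            - --tensor-parallel-size=1
            - --gpu-memory-utilization=0.90
            - --enforce-eager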
apps/vllm/ingress.yaml (new file)
@@ -0,0 +1,25 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    external-dns.alpha.kubernetes.io/target: {{ .cloud.domain }}
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: {{ .apps.vllm.domain }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - {{ .apps.vllm.domain }}
      secretName: vllm-tls
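As a concrete illustration, assuming a hypothetical .cloud.domain of example.com (so .apps.vllm.domain defaults to vllm.example.com per manifest.yaml), the templated host values would resolve to:

  rules:
    - host: vllm.example.com
  tls:
    - hosts:
        - vllm.example.com
      secretName: vllm-tls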
apps/vllm/kustomization.yaml (new file)
@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: {{ .apps.vllm.namespace }}
labels:
  - includeSelectors: true
    pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
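Because includeSelectors is true, kustomize propagates the label pairs into selectors as well as into metadata. A sketch of the Service selector after kustomize build, shown here only to illustrate standard kustomize behavior:

  selector:
    app: vllm
    component: inference
    managedBy: kustomize
    partOf: wild-cloud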
apps/vllm/manifest.yaml (new file)
@@ -0,0 +1,21 @@
name: vllm
description: vLLM is a fast and easy-to-use library for LLM inference and serving, with an OpenAI-compatible API
version: 0.5.4
icon: https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
requires: []
defaultConfig:
  image: vllm/vllm-openai:v0.5.4
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  tensorParallelSize: 1
  gpuMemoryUtilization: 0.90
  enforceEager: true
  gpuProduct: "RTX 4090"
  cpuRequest: "4"
  cpuLimit: "8"
  memoryRequest: "16Gi"
  memoryLimit: "24Gi"
  gpuCount: 1
  domain: vllm.{{ .cloud.domain }}
  namespace: llm
requiredSecrets: []
apps/vllm/namespace.yaml (new file)
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: {{ .apps.vllm.namespace }}
apps/vllm/service.yaml (new file)
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  selector:
    component: inference
  ports:
    - name: http
      port: 8000
      targetPort: http
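A minimal in-cluster smoke test, assuming the default llm namespace from manifest.yaml and the public curlimages/curl image (neither the Pod nor the image is part of this commit): the readiness path /v1/models should return the loaded model list once the server is up.

apiVersion: v1
kind: Pod
metadata:
  name: vllm-smoke-test
  namespace: llm
spec:
  restartPolicy: Never
  containers:
    - name: curl
      image: curlimages/curl:8.8.0
      # Query the vLLM OpenAI-compatible endpoint through the ClusterIP Service.
      args: ["-sf", "http://vllm.llm.svc.cluster.local:8000/v1/models"]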