From 0dc8696820a0fa8f11c978c0b0d34d1a55e24319 Mon Sep 17 00:00:00 2001
From: Paul Payne
Date: Wed, 24 Sep 2025 04:33:57 -0700
Subject: [PATCH] Adds vLLM app.

---
 apps/vllm/deployment.yaml    | 75 ++++++++++++++++++++++++++++++++++++
 apps/vllm/ingress.yaml       | 25 ++++++++++++
 apps/vllm/kustomization.yaml | 14 +++++++
 apps/vllm/manifest.yaml      | 21 ++++++++++
 apps/vllm/namespace.yaml     |  4 ++
 apps/vllm/service.yaml       | 12 ++++++
 6 files changed, 151 insertions(+)
 create mode 100644 apps/vllm/deployment.yaml
 create mode 100644 apps/vllm/ingress.yaml
 create mode 100644 apps/vllm/kustomization.yaml
 create mode 100644 apps/vllm/manifest.yaml
 create mode 100644 apps/vllm/namespace.yaml
 create mode 100644 apps/vllm/service.yaml

diff --git a/apps/vllm/deployment.yaml b/apps/vllm/deployment.yaml
new file mode 100644
index 0000000..b366638
--- /dev/null
+++ b/apps/vllm/deployment.yaml
@@ -0,0 +1,75 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      component: inference
+  template:
+    metadata:
+      labels:
+        component: inference
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      nodeSelector:
+        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
+      containers:
+        - name: vllm
+          image: "{{ .apps.vllm.image }}"
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            readOnlyRootFilesystem: false
+          args:
+            - --model={{ .apps.vllm.model }}
+            - --max-model-len={{ .apps.vllm.maxModelLen }}
+            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
+            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
+            {{- if .apps.vllm.enforceEager }}
+            - --enforce-eager
+            {{- end }}
+          env:
+            - name: VLLM_TORCH_DTYPE
+              value: "auto"
+            - name: VLLM_WORKER_CONCURRENCY
+              value: "1"
+          ports:
+            - name: http
+              containerPort: 8000
+          resources:
+            requests:
+              cpu: "{{ .apps.vllm.cpuRequest }}"
+              memory: "{{ .apps.vllm.memoryRequest }}"
+              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
+            limits:
+              cpu: "{{ .apps.vllm.cpuLimit }}"
+              memory: "{{ .apps.vllm.memoryLimit }}"
+              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
+          readinessProbe:
+            httpGet:
+              path: /v1/models
+              port: http
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 60
+            periodSeconds: 15
+            timeoutSeconds: 5
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
\ No newline at end of file
diff --git a/apps/vllm/ingress.yaml b/apps/vllm/ingress.yaml
new file mode 100644
index 0000000..5135d2d
--- /dev/null
+++ b/apps/vllm/ingress.yaml
@@ -0,0 +1,25 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: vllm
+  annotations:
+    external-dns.alpha.kubernetes.io/target: {{ .cloud.domain }}
+    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+    traefik.ingress.kubernetes.io/router.tls: "true"
+    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
+spec:
+  rules:
+    - host: {{ .apps.vllm.domain }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: vllm
+                port:
+                  number: 8000
+  tls:
+    - hosts:
+        - {{ .apps.vllm.domain }}
+      secretName: vllm-tls
\ No newline at end of file
diff --git a/apps/vllm/kustomization.yaml b/apps/vllm/kustomization.yaml
new file mode 100644
index 0000000..6f45422
--- /dev/null
+++ b/apps/vllm/kustomization.yaml
@@ -0,0 +1,14 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: {{ .apps.vllm.namespace }}
+labels:
+  - includeSelectors: true
+    pairs:
+      app: vllm
+      managedBy: kustomize
+      partOf: wild-cloud
+resources:
+  - namespace.yaml
+  - deployment.yaml
+  - service.yaml
+  - ingress.yaml
\ No newline at end of file
diff --git a/apps/vllm/manifest.yaml b/apps/vllm/manifest.yaml
new file mode 100644
index 0000000..1ab8ffd
--- /dev/null
+++ b/apps/vllm/manifest.yaml
@@ -0,0 +1,21 @@
+name: vllm
+description: vLLM is a fast and easy-to-use library for LLM inference and serving with an OpenAI-compatible API
+version: 0.5.4
+icon: https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
+requires: []
+defaultConfig:
+  image: vllm/vllm-openai:v0.5.4
+  model: Qwen/Qwen2.5-7B-Instruct
+  maxModelLen: 8192
+  tensorParallelSize: 1
+  gpuMemoryUtilization: 0.90
+  enforceEager: true
+  gpuProduct: "RTX 4090"
+  cpuRequest: "4"
+  cpuLimit: "8"
+  memoryRequest: "16Gi"
+  memoryLimit: "24Gi"
+  gpuCount: 1
+  domain: vllm.{{ .cloud.domain }}
+  namespace: llm
+requiredSecrets: []
\ No newline at end of file
diff --git a/apps/vllm/namespace.yaml b/apps/vllm/namespace.yaml
new file mode 100644
index 0000000..8fc7d02
--- /dev/null
+++ b/apps/vllm/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {{ .apps.vllm.namespace }}
\ No newline at end of file
diff --git a/apps/vllm/service.yaml b/apps/vllm/service.yaml
new file mode 100644
index 0000000..249b0cf
--- /dev/null
+++ b/apps/vllm/service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm
+spec:
+  type: ClusterIP
+  selector:
+    component: inference
+  ports:
+    - name: http
+      port: 8000
+      targetPort: http
\ No newline at end of file
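
---

Note for reviewers: once these manifests are applied and the pod passes its
readiness probe, the server answers the standard OpenAI-compatible REST API
through the ingress. A minimal smoke test in Python, offered as a sketch only:
it assumes the `openai` client package is installed and that
`.apps.vllm.domain` renders to the hypothetical host `vllm.example.com`
(substitute the real rendered domain); the model id must match the configured
`.apps.vllm.model`.

    # smoke_test.py -- query the vLLM OpenAI-compatible endpoint via the ingress
    from openai import OpenAI

    client = OpenAI(
        base_url="https://vllm.example.com/v1",  # hypothetical rendered .apps.vllm.domain
        api_key="unused",  # vLLM accepts any key unless launched with --api-key
    )

    # The served model id matches the --model argument (.apps.vllm.model).
    resp = client.chat.completions.create(
        model="Qwen/Qwen2.5-7B-Instruct",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=32,
    )
    print(resp.choices[0].message.content)

The readiness probe already polls /v1/models, so a Ready pod should answer this
request; cold starts can still be slow, since no cache volume is mounted and the
model weights are downloaded into the container filesystem on each restart.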