Adds vLLM app.

2025-09-24 04:33:57 -07:00
parent 80b9d14ec4
commit 0dc8696820
6 changed files with 151 additions and 0 deletions

75
apps/vllm/deployment.yaml Normal file

@@ -0,0 +1,75 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"

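With the defaults from manifest.yaml below, the rendered container invocation amounts to roughly the following (a sketch assuming the vllm/vllm-openai image's usual entrypoint, python3 -m vllm.entrypoints.openai.api_server; not the literal rendered output):

  python3 -m vllm.entrypoints.openai.api_server \
    --model=Qwen/Qwen2.5-7B-Instruct \
    --max-model-len=8192 \
    --tensor-parallel-size=1 \
    --gpu-memory-utilization=0.90 \
    --enforce-eager
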
25
apps/vllm/ingress.yaml Normal file

@@ -0,0 +1,25 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    external-dns.alpha.kubernetes.io/target: {{ .cloud.domain }}
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: {{ .apps.vllm.domain }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - {{ .apps.vllm.domain }}
      secretName: vllm-tls

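Once DNS and TLS are in place, the OpenAI-compatible API is reachable at the configured host. A minimal smoke test, using vllm.example.com as a hypothetical stand-in for the rendered domain and the default model from manifest.yaml:

  curl -s https://vllm.example.com/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "Qwen/Qwen2.5-7B-Instruct",
         "messages": [{"role": "user", "content": "Say hello."}]}'
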
14
apps/vllm/kustomization.yaml Normal file

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: {{ .apps.vllm.namespace }}
labels:
  - includeSelectors: true
    pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml

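Assuming the {{ }} placeholders have already been rendered by the wild-cloud tooling (the raw templates are not valid YAML until then), the app can be previewed and applied with the standard kustomize workflow:

  kubectl kustomize apps/vllm    # render the manifests for inspection
  kubectl apply -k apps/vllm     # create/update namespace, deployment, service, ingress
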
21
apps/vllm/manifest.yaml Normal file

@@ -0,0 +1,21 @@
name: vllm
description: vLLM is a fast and easy-to-use library for LLM inference and serving with an OpenAI-compatible API
version: 0.5.4
icon: https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
requires: []
defaultConfig:
  image: vllm/vllm-openai:v0.5.4
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  tensorParallelSize: 1
  gpuMemoryUtilization: 0.90
  enforceEager: true
  gpuProduct: "RTX 4090"
  cpuRequest: "4"
  cpuLimit: "8"
  memoryRequest: "16Gi"
  memoryLimit: "24Gi"
  gpuCount: 1
  domain: vllm.{{ .cloud.domain }}
  namespace: llm
requiredSecrets: []

4
apps/vllm/namespace.yaml Normal file

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: {{ .apps.vllm.namespace }}

12
apps/vllm/service.yaml Normal file

@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  selector:
    component: inference
  ports:
    - name: http
      port: 8000
      targetPort: http
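
From inside the cluster, the ClusterIP service can be checked directly, bypassing the ingress; a sketch assuming the default llm namespace, hitting the same /v1/models endpoint as the readiness probe:

  kubectl -n llm run vllm-smoke --rm -it --restart=Never \
    --image=curlimages/curl --command -- \
    curl -s http://vllm:8000/v1/models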