Reorganized for new stable/waypoint versioning design.

This commit is contained in:
2026-05-24 18:28:47 +00:00
parent 945d2225a2
commit bc7a168851
352 changed files with 1264 additions and 294 deletions

29
vllm/versions/0/README.md Normal file
View File

@@ -0,0 +1,29 @@
# vLLM
vLLM is a fast and easy-to-use library for LLM inference and serving with an OpenAI-compatible API. Use it to run large language models on your own hardware.
## Dependencies
None, but requires a GPU node in your cluster.
## Configuration
Key settings configured through your instance's `config.yaml`:
- **model** - Hugging Face model to serve (default: `Qwen/Qwen2.5-7B-Instruct`)
- **maxModelLen** - Maximum sequence length (default: `8192`)
- **gpuProduct** - Required GPU type, matched against the node's `nvidia.com/gpu.product` label (default: `RTX 4090`)
- **gpuCount** - Number of GPUs to use (default: `1`)
- **gpuMemoryUtilization** - Fraction of GPU memory to use (default: `0.9`)
- **domain** - Where the API will be accessible (default: `vllm.{your-cloud-domain}`)
## Access
After deployment, the OpenAI-compatible API will be available at:
- `https://vllm.{your-cloud-domain}/v1`
Other apps on the cluster (such as Open WebUI) can connect internally at `http://vllm.llm.svc.cluster.local:8000/v1` (the `vllm` Service in the default `llm` namespace).
## Hardware Requirements
This app requires a GPU node in your cluster. Adjust the `gpuProduct`, `gpuCount`, and memory settings to match your available hardware.

View File

@@ -0,0 +1,73 @@
# vLLM inference server: a single-replica Deployment that serves the API on
# the container port named "http" (8000).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Schedule only onto nodes whose GPU product label equals the configured
      # value.
      nodeSelector:
        nvidia.com/gpu.product: "{{ .gpuProduct }}"
      # Allow scheduling onto nodes tainted nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - name: vllm
          image: vllm/vllm-openai:v0.5.4
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # NOTE(review): no volume is mounted for the model cache, so the
            # root filesystem must stay writable and weights are presumably
            # re-downloaded on every container start. Confirm uid 1000 has a
            # writable cache directory in this image.
            readOnlyRootFilesystem: false
          args:
            - "--model={{ .model }}"
            - "--max-model-len={{ .maxModelLen }}"
            # Shard across every GPU requested below; a fixed value of 1 would
            # leave additional GPUs reserved but idle when gpuCount > 1.
            - "--tensor-parallel-size={{ .gpuCount }}"
            - "--gpu-memory-utilization={{ .gpuMemoryUtilization }}"
            # Boolean switch: it takes no value.
            - "--enforce-eager"
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .cpuRequest }}"
              memory: "{{ .memoryRequest }}"
              nvidia.com/gpu: "{{ .gpuCount }}"
            limits:
              cpu: "{{ .cpuLimit }}"
              memory: "{{ .memoryLimit }}"
              nvidia.com/gpu: "{{ .gpuCount }}"
          # Give the server up to 15 minutes (90 x 10s) to come up. Liveness
          # and readiness checks are held off until this probe succeeds, so a
          # slow model load is not mistaken for a hung container.
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 90
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5

View File

@@ -0,0 +1,25 @@
# HTTPS entry point for the vLLM API: routes the configured host to the
# "vllm" Service on port 8000.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    # Templated values are quoted so an empty or unusual expansion still
    # renders as a YAML string (annotation values must be strings).
    external-dns.alpha.kubernetes.io/target: "{{ .externalDnsDomain }}"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: "{{ .domain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - "{{ .domain }}"
      secretName: vllm-tls

View File

@@ -0,0 +1,14 @@
# Kustomize entry point for the vLLM app.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied to the resources listed below.
namespace: {{ .namespace }}

resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml

# Common labels; includeSelectors also adds them to selectors.
labels:
  - pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
    includeSelectors: true

View File

@@ -0,0 +1,15 @@
# App manifest for vLLM. The keys under defaultConfig are the values the
# templated manifests read as {{ .key }}.
# The version's "0.5.4" matches the vllm/vllm-openai:v0.5.4 image tag; the
# "-1" suffix is presumably a packaging revision (TODO confirm).
version: 0.5.4-1
requires: []
defaultConfig:
  namespace: llm
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  gpuMemoryUtilization: 0.9
  # NOTE(review): this value is used verbatim as the nodeSelector for the
  # nvidia.com/gpu.product label. Kubernetes label values cannot contain
  # spaces, so confirm the exact label on your GPU nodes (it may look like
  # NVIDIA-GeForce-RTX-4090) and override accordingly.
  gpuProduct: RTX 4090
  cpuRequest: '4'
  cpuLimit: '8'
  memoryRequest: 16Gi
  memoryLimit: 24Gi
  gpuCount: 1
  domain: vllm.{{ .cloud.domain }}
  # Read by the ingress external-dns target annotation; without a default the
  # annotation renders empty. Assumes the target is the cloud domain - verify.
  externalDnsDomain: '{{ .cloud.domain }}'
defaultSecrets: []

View File

@@ -0,0 +1,4 @@
# Namespace for the vLLM app; its name is taken from the "namespace" config
# value.
apiVersion: v1
kind: Namespace
metadata:
  name: {{ .namespace }}

View File

@@ -0,0 +1,12 @@
# Cluster-internal Service in front of the inference pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 8000
      # Refers to the container port by name rather than by number.
      targetPort: http
  # Selects pods labelled component=inference.
  selector:
    component: inference