Reorganized for new stable/waypoint versioning design.
This commit is contained in:
29
vllm/versions/0/README.md
Normal file
29
vllm/versions/0/README.md
Normal file
@@ -0,0 +1,29 @@
|
||||
# vLLM
|
||||
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving with an OpenAI-compatible API. Use it to run large language models on your own hardware.
|
||||
|
||||
## Dependencies
|
||||
|
||||
None, but requires a GPU node in your cluster.
|
||||
|
||||
## Configuration
|
||||
|
||||
Key settings configured through your instance's `config.yaml`:
|
||||
|
||||
- **model** - Hugging Face model to serve (default: `Qwen/Qwen2.5-7B-Instruct`)
|
||||
- **maxModelLen** - Maximum sequence length (default: `8192`)
|
||||
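- **gpuProduct** - Required GPU type; must exactly match the `nvidia.com/gpu.product` label on your GPU node, because it is used as the pod's node selector (default: `RTX 4090`)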
|
||||
- **gpuCount** - Number of GPUs to use (default: `1`)
|
||||
- **gpuMemoryUtilization** - Fraction of GPU memory to use (default: `0.9`)
|
||||
- **domain** - Where the API will be accessible (default: `vllm.{your-cloud-domain}`)
|
||||
|
||||
## Access
|
||||
|
||||
After deployment, the OpenAI-compatible API will be available at:
|
||||
- `https://vllm.{your-cloud-domain}/v1`
|
||||
|
||||
Other apps on the cluster (such as Open WebUI) can connect internally at `http://vllm.llm.svc.cluster.local:8000/v1` (the Service is named `vllm`; replace `llm` with your configured namespace if you changed it).
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
This app requires a GPU node in your cluster. Adjust the `gpuProduct`, `gpuCount`, and memory settings to match your available hardware.
|
||||
73
vllm/versions/0/deployment.yaml
Normal file
73
vllm/versions/0/deployment.yaml
Normal file
@@ -0,0 +1,73 @@
|
||||
# vLLM inference server (OpenAI-compatible API on port 8000).
# Values written as {{ .name }} are template placeholders that are filled in
# from the app's configuration before this manifest is applied.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Schedule only onto nodes whose GPU product label matches exactly.
      nodeSelector:
        nvidia.com/gpu.product: "{{ .gpuProduct }}"
      containers:
        - name: vllm
          image: vllm/vllm-openai:v0.5.4
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: false
          args:
            - "--model={{ .model }}"
            - "--max-model-len={{ .maxModelLen }}"
            # Shard the model across every GPU requested below instead of a
            # hard-coded 1, so raising gpuCount actually uses the extra GPUs.
            # NOTE(review): multi-GPU tensor parallelism usually needs more
            # shared memory than the container default; consider mounting a
            # memory-backed emptyDir at /dev/shm when gpuCount > 1.
            - "--tensor-parallel-size={{ .gpuCount }}"
            - "--gpu-memory-utilization={{ .gpuMemoryUtilization }}"
            # Boolean switch: it takes no value ("--enforce-eager=True" is
            # rejected by the argument parser).
            - --enforce-eager
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .cpuRequest }}"
              memory: "{{ .memoryRequest }}"
              nvidia.com/gpu: "{{ .gpuCount }}"
            limits:
              cpu: "{{ .cpuLimit }}"
              memory: "{{ .memoryLimit }}"
              nvidia.com/gpu: "{{ .gpuCount }}"
          # Model download and load can take many minutes. The startup probe
          # holds off the readiness/liveness probes until the server first
          # answers, allowing up to 90 x 10s = 15 minutes before a restart.
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 90
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      # Allow scheduling onto GPU nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
|
||||
25
vllm/versions/0/ingress.yaml
Normal file
25
vllm/versions/0/ingress.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Public HTTPS entry point for the vLLM API, routed to the `vllm` Service on
# port 8000. Templated values are quoted so that an empty or boolean/number-
# looking expansion still parses as a string.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    external-dns.alpha.kubernetes.io/target: "{{ .externalDnsDomain }}"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: "{{ .domain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - "{{ .domain }}"
      secretName: vllm-tls
|
||||
14
vllm/versions/0/kustomization.yaml
Normal file
14
vllm/versions/0/kustomization.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
# Kustomize entry point for the vLLM app: places every listed resource in the
# configured namespace and stamps the common labels on them.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Quoted so the rendered namespace is always read as a string.
namespace: "{{ .namespace }}"
labels:
  # includeSelectors: true also adds these pairs to label selectors
  # (the Deployment's matchLabels and the Service's selector).
  - includeSelectors: true
    pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
|
||||
15
vllm/versions/0/manifest.yaml
Normal file
15
vllm/versions/0/manifest.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
# App manifest for vLLM: version, dependencies, and the default values for the
# {{ .name }} placeholders used by the other files in this directory.
version: '0.5.4-1'
requires: []
defaultConfig:
  namespace: llm
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  gpuMemoryUtilization: 0.9
  # Used verbatim as the nvidia.com/gpu.product nodeSelector value in
  # deployment.yaml. Label values cannot contain spaces, so "RTX 4090" can
  # never match; use the hyphenated product label from the GPU node instead.
  # NOTE(review): confirm with `kubectl get nodes --show-labels`.
  gpuProduct: NVIDIA-GeForce-RTX-4090
  cpuRequest: '4'
  cpuLimit: '8'
  memoryRequest: 16Gi
  memoryLimit: 24Gi
  gpuCount: 1
  domain: 'vllm.{{ .cloud.domain }}'
  # Referenced by the external-dns target annotation in ingress.yaml but not
  # previously defaulted here.
  # NOTE(review): assumed default; confirm it matches your external-dns setup.
  externalDnsDomain: '{{ .cloud.domain }}'
defaultSecrets: []
|
||||
4
vllm/versions/0/namespace.yaml
Normal file
4
vllm/versions/0/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Namespace that holds all vLLM resources.
apiVersion: v1
kind: Namespace
metadata:
  # Quoted so the rendered value is always a string, even if it is empty or
  # looks like a number or boolean.
  name: "{{ .namespace }}"
|
||||
12
vllm/versions/0/service.yaml
Normal file
12
vllm/versions/0/service.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Cluster-internal Service in front of the vLLM pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  ports:
    # Forwards Service port 8000 to the container port named "http".
    - name: http
      port: 8000
      targetPort: http
  # Selects the pods created by the vLLM Deployment's pod template.
  selector:
    component: inference
|
||||
Reference in New Issue
Block a user