Adds VLLM app.
This commit is contained in:
75
apps/vllm/deployment.yaml
Normal file
75
apps/vllm/deployment.yaml
Normal file
@@ -0,0 +1,75 @@
---
# Deployment for the vLLM OpenAI-compatible inference server.
# All "{{ ... }}" expressions are Go-template placeholders rendered
# before this file is applied; scalar substitutions are quoted so the
# file stays valid YAML even if a value expands empty or boolean-like.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  # Single replica: the pod pins GPU(s); scale-out needs per-replica GPUs.
  replicas: 1
  selector:
    matchLabels:
      component: inference
  template:
    metadata:
      labels:
        component: inference
    spec:
      # Run the whole pod as a non-root user with the default seccomp profile.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Pin scheduling to nodes exposing the configured GPU model
      # (label published by the NVIDIA GPU feature-discovery operator).
      nodeSelector:
        nvidia.com/gpu.product: "{{ .apps.vllm.gpuProduct }}"
      containers:
        - name: vllm
          image: "{{ .apps.vllm.image }}"
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # vLLM writes model/cache files under its root filesystem,
            # so it cannot run read-only.
            readOnlyRootFilesystem: false
          args:
            - --model={{ .apps.vllm.model }}
            - --max-model-len={{ .apps.vllm.maxModelLen }}
            - --tensor-parallel-size={{ .apps.vllm.tensorParallelSize }}
            - --gpu-memory-utilization={{ .apps.vllm.gpuMemoryUtilization }}
            {{- if .apps.vllm.enforceEager }}
            # --enforce-eager is a store_true flag; passing "=True" is
            # rejected by vLLM's argument parser.
            - --enforce-eager
            {{- end }}
          env:
            - name: VLLM_TORCH_DTYPE
              value: "auto"
            - name: VLLM_WORKER_CONCURRENCY
              value: "1"
          ports:
            - name: http
              containerPort: 8000
          resources:
            requests:
              cpu: "{{ .apps.vllm.cpuRequest }}"
              memory: "{{ .apps.vllm.memoryRequest }}"
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
            limits:
              cpu: "{{ .apps.vllm.cpuLimit }}"
              memory: "{{ .apps.vllm.memoryLimit }}"
              # GPU request and limit must be equal (extended resources
              # do not support overcommit).
              nvidia.com/gpu: {{ .apps.vllm.gpuCount }}
          # Ready once the OpenAI-compatible API can list models.
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          # Liveness uses the lightweight /health endpoint; generous
          # initial delay to allow model download/load.
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 15
            timeoutSeconds: 5
      # Allow scheduling onto GPU nodes tainted nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
25
apps/vllm/ingress.yaml
Normal file
25
apps/vllm/ingress.yaml
Normal file
@@ -0,0 +1,25 @@
---
# Ingress exposing the vLLM HTTP API via Traefik with a Let's Encrypt
# certificate, DNS managed by external-dns (Cloudflare proxy disabled).
# Templated scalars are quoted: an unquoted value starting with "{{" is
# read as a YAML flow mapping by pre-render tooling, and an empty
# expansion would otherwise become null.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm
  annotations:
    external-dns.alpha.kubernetes.io/target: "{{ .cloud.domain }}"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: "{{ .apps.vllm.domain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm
                port:
                  number: 8000
  tls:
    - hosts:
        - "{{ .apps.vllm.domain }}"
      secretName: vllm-tls
14
apps/vllm/kustomization.yaml
Normal file
14
apps/vllm/kustomization.yaml
Normal file
@@ -0,0 +1,14 @@
---
# Kustomization assembling the vLLM app manifests into the configured
# namespace, with common labels applied to resources and selectors.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Quoted so an empty template expansion stays a string rather than null.
namespace: "{{ .apps.vllm.namespace }}"
labels:
  # includeSelectors propagates these labels into selector fields too.
  - includeSelectors: true
    pairs:
      app: vllm
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
21
apps/vllm/manifest.yaml
Normal file
21
apps/vllm/manifest.yaml
Normal file
@@ -0,0 +1,21 @@
---
# App manifest: metadata plus the default configuration values consumed
# by the templated Kubernetes manifests in this directory.
name: vllm
description: vLLM is a fast and easy-to-use library for LLM inference and serving with OpenAI-compatible API
# Quoted so version-like values are never re-typed by YAML tooling.
version: "0.5.4"
icon: https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
# No other apps are required by this one.
requires: []
defaultConfig:
  image: vllm/vllm-openai:v0.5.4
  model: Qwen/Qwen2.5-7B-Instruct
  maxModelLen: 8192
  tensorParallelSize: 1
  gpuMemoryUtilization: 0.90
  enforceEager: true
  # Must match the node's nvidia.com/gpu.product label value.
  gpuProduct: "RTX 4090"
  cpuRequest: "4"
  cpuLimit: "8"
  memoryRequest: "16Gi"
  memoryLimit: "24Gi"
  gpuCount: 1
  domain: vllm.{{ .cloud.domain }}
  namespace: llm
# No secrets are needed by this app.
requiredSecrets: []
4
apps/vllm/namespace.yaml
Normal file
4
apps/vllm/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
---
# Namespace that all vLLM resources are deployed into.
apiVersion: v1
kind: Namespace
metadata:
  # Quoted: an unquoted value beginning with "{{" parses as a YAML flow
  # mapping in pre-render tooling, and an empty expansion would be null.
  name: "{{ .apps.vllm.namespace }}"
12
apps/vllm/service.yaml
Normal file
12
apps/vllm/service.yaml
Normal file
@@ -0,0 +1,12 @@
---
# ClusterIP Service fronting the vLLM inference pods; the Ingress
# routes external traffic to this service on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: vllm
spec:
  type: ClusterIP
  # Matches the pod-template label set by the Deployment.
  selector:
    component: inference
  ports:
    - name: http
      port: 8000
      # Resolves to the container port named "http" (8000).
      targetPort: http
Reference in New Issue
Block a user