Reorganized for new stable/waypoint versioning design.

2026-05-24 18:28:47 +00:00
parent 945d2225a2
commit bc7a168851
352 changed files with 1264 additions and 294 deletions
--- a/vllm/versions/0/README.md
+++ b/vllm/versions/0/README.md
@@ -0,0 +1,29 @@
+# vLLM
+
+vLLM is a fast and easy-to-use library for LLM inference and serving with an OpenAI-compatible API. Use it to run large language models on your own hardware.
+
+## Dependencies
+
+None, but requires a GPU node in your cluster.
+
+## Configuration
+
+Key settings configured through your instance's `config.yaml`:
+
+- **model** - Hugging Face model to serve (default: `Qwen/Qwen2.5-7B-Instruct`)
+- **maxModelLen** - Maximum sequence length (default: `8192`)
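+- **gpuProduct** - Required GPU type, matched exactly against the node's `nvidia.com/gpu.product` label (default: `RTX 4090`). Label values cannot contain spaces, so set this to the label as it appears on your node, e.g. `NVIDIA-GeForce-RTX-4090`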
+- **gpuCount** - Number of GPUs to use (default: `1`)
+- **gpuMemoryUtilization** - Fraction of GPU memory to use (default: `0.9`)
+- **domain** - Where the API will be accessible (default: `vllm.{your-cloud-domain}`)
+
+## Access
+
+After deployment, the OpenAI-compatible API will be available at:
+- `https://vllm.{your-cloud-domain}/v1`
+
+Other apps on the cluster (such as Open WebUI) can connect internally at `http://vllm.llm.svc.cluster.local:8000/v1` (the `vllm` service in the default `llm` namespace; substitute your own namespace if you changed it).
+
+## Hardware Requirements
+
+This app requires a GPU node in your cluster. Adjust `gpuProduct`, `gpuCount`, `gpuMemoryUtilization`, and the CPU/memory requests and limits (`cpuRequest`, `cpuLimit`, `memoryRequest`, `memoryLimit`) to match your available hardware.
--- a/vllm/versions/0/deployment.yaml
+++ b/vllm/versions/0/deployment.yaml
@@ -0,0 +1,108 @@
+# vLLM inference server: a single replica exposing the OpenAI-compatible API on port 8000.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      component: inference
+  template:
+    metadata:
+      labels:
+        component: inference
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      # Schedule only onto nodes whose GPU product label matches the configured value.
+      nodeSelector:
+        nvidia.com/gpu.product: "{{ .gpuProduct }}"
+      containers:
+        - name: vllm
+          image: vllm/vllm-openai:v0.5.4
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            readOnlyRootFilesystem: false
+          args:
+            - --model={{ .model }}
+            - --max-model-len={{ .maxModelLen }}
+            # Shard the model across every GPU requested under resources; a
+            # fixed value of 1 would leave additional GPUs allocated but idle.
+            - --tensor-parallel-size={{ .gpuCount }}
+            - --gpu-memory-utilization={{ .gpuMemoryUtilization }}
+            # Boolean switch; it takes no value.
+            - --enforce-eager
+          env:
+            - name: VLLM_TORCH_DTYPE
+              value: "auto"
+            - name: VLLM_WORKER_CONCURRENCY
+              value: "1"
+            # The container runs as UID 1000, so keep HOME and the Hugging Face
+            # cache on the writable volume mounted below.
+            # NOTE(review): assumes the image has no writable home for UID 1000 - confirm.
+            - name: HOME
+              value: "/cache"
+            - name: HF_HOME
+              value: "/cache/huggingface"
+          ports:
+            - name: http
+              containerPort: 8000
+          resources:
+            requests:
+              cpu: "{{ .cpuRequest }}"
+              memory: "{{ .memoryRequest }}"
+              nvidia.com/gpu: "{{ .gpuCount }}"
+            limits:
+              cpu: "{{ .cpuLimit }}"
+              memory: "{{ .memoryLimit }}"
+              nvidia.com/gpu: "{{ .gpuCount }}"
+          volumeMounts:
+            - name: cache
+              mountPath: /cache
+            - name: shm
+              mountPath: /dev/shm
+          # Give model download and load up to 30 minutes (180 x 10s) before the
+          # liveness probe is allowed to restart the container.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 180
+          readinessProbe:
+            httpGet:
+              path: /v1/models
+              port: http
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 60
+            periodSeconds: 15
+            timeoutSeconds: 5
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      volumes:
+        # Scratch space for downloaded model weights; contents are lost when the pod is deleted.
+        - name: cache
+          emptyDir: {}
+        # Shared memory for inter-process communication during tensor-parallel inference.
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: "2Gi"
--- a/vllm/versions/0/ingress.yaml
+++ b/vllm/versions/0/ingress.yaml
@@ -0,0 +1,27 @@
+# Public route for the vLLM API: requests for the configured domain go to the vllm Service on port 8000.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: vllm
+  annotations:
+    # NOTE(review): externalDnsDomain is not among the defaultConfig keys in manifest.yaml - confirm it is supplied elsewhere.
+    external-dns.alpha.kubernetes.io/target: "{{ .externalDnsDomain }}"
+    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+    traefik.ingress.kubernetes.io/router.tls: "true"
+    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
+spec:
+  rules:
+    - host: "{{ .domain }}"
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: vllm
+                port:
+                  number: 8000
+  tls:
+    - hosts:
+        - "{{ .domain }}"
+      secretName: vllm-tls
--- a/vllm/versions/0/kustomization.yaml
+++ b/vllm/versions/0/kustomization.yaml
@@ -0,0 +1,16 @@
+# Kustomize entry point: sets the configured namespace on the listed resources
+# and adds the shared labels to them, including their selectors.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: "{{ .namespace }}"
+labels:
+  - includeSelectors: true
+    pairs:
+      app: vllm
+      managedBy: kustomize
+      partOf: wild-cloud
+resources:
+  - namespace.yaml
+  - deployment.yaml
+  - service.yaml
+  - ingress.yaml
--- a/vllm/versions/0/manifest.yaml
+++ b/vllm/versions/0/manifest.yaml
@@ -0,0 +1,17 @@
+version: 0.5.4-1
+requires: []
+defaultConfig:
+  namespace: llm
+  model: Qwen/Qwen2.5-7B-Instruct
+  maxModelLen: 8192
+  gpuMemoryUtilization: 0.9
+  # Used verbatim as a nodeSelector value, so it must be a valid Kubernetes
+  # label value (no spaces) matching the node's nvidia.com/gpu.product label.
+  gpuProduct: NVIDIA-GeForce-RTX-4090
+  cpuRequest: '4'
+  cpuLimit: '8'
+  memoryRequest: 16Gi
+  memoryLimit: 24Gi
+  gpuCount: 1
+  domain: vllm.{{ .cloud.domain }}
+defaultSecrets: []
--- a/vllm/versions/0/namespace.yaml
+++ b/vllm/versions/0/namespace.yaml
@@ -0,0 +1,5 @@
+# Namespace for the vLLM app; its name comes from the app's namespace setting.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: "{{ .namespace }}"
--- a/vllm/versions/0/service.yaml
+++ b/vllm/versions/0/service.yaml
@@ -0,0 +1,15 @@
+# Cluster-internal Service in front of the inference pods.
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm
+spec:
+  # Route to pods carrying the inference component label.
+  selector:
+    component: inference
+  type: ClusterIP
+  ports:
+    # Forward port 8000 to the container port named "http".
+    - port: 8000
+      targetPort: http
+      name: http