Table of Contents

# vLLM

Running vLLM from Docker:

# Start the vllm/vllm-openai image with all GPUs, publish port 8000, and mount
# the host model directory as the container's Hugging Face hub cache.
# Arguments after the image name are passed to the server in the container.
# Every line except the last must end with a backslash so the shell treats
# the whole thing as a single command.
docker run --gpus all -p 8000:8000 \
  -v /mnt/d/_workarea/models:/root/.cache/huggingface/hub \
  --rm \
  vllm/vllm-openai:latest \
  --dtype auto \
  --gpu-memory-utilization 0.8 \
  --max-model-len 8192 \
  --model Qwen/Qwen2.5-7B-Instruct-AWQ

Testing the vLLM server:

# Smoke-test the server: POST a single-turn chat request as JSON to the
# chat completions endpoint on localhost:8000.
curl --request POST \
  --header 'Content-Type: application/json' \
  --data '{
		"model": "Qwen/Qwen2.5-7B-Instruct-AWQ",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}' \
  'http://localhost:8000/v1/chat/completions'