# vLLM
# Running the vLLM server from Docker
# Start the vllm/vllm-openai image as a throwaway container.
#
# Docker options (before the image name):
#   --gpus all   make all host GPUs available to the container
#   -p 8000:8000 publish container port 8000 on host port 8000
#   -v ...       mount the host model directory over the container's
#                Hugging Face hub cache path, so files stored there persist
#                on the host across runs
#   --rm         remove the container when it exits
#
# Everything after the image name is passed to the image's entrypoint
# (the vLLM server): --dtype, --gpu-memory-utilization, --max-model-len and
# --model.
#
# NOTE: every line except the last must end with a backslash; otherwise the
# command ends early and the remaining arguments (e.g. --model) are run by
# the shell as a separate, invalid command.
docker run --gpus all -p 8000:8000 \
  -v /mnt/d/_workarea/models:/root/.cache/huggingface/hub \
  --rm \
  vllm/vllm-openai:latest \
  --dtype auto \
  --gpu-memory-utilization 0.8 \
  --max-model-len 8192 \
  --model Qwen/Qwen2.5-7B-Instruct-AWQ
# Testing the vLLM server
# Send a single-turn chat request to the server's /v1/chat/completions
# endpoint on localhost:8000. The JSON body names the model and carries one
# user message.
curl \
  --request POST \
  --header "Content-Type: application/json" \
  --data '{
"model": "Qwen/Qwen2.5-7B-Instruct-AWQ",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}' \
  "http://localhost:8000/v1/chat/completions"