Inference delays

I’m deploying a large language model (LLM) with FastAPI and vLLM, but the first inference request is very slow (up to 5 minutes). I have already tried loading the model at startup with the lifespan context manager (sketch at the end of the post), but the delay persists. Are there effective strategies to reduce this initial response time?

from vllm import LLM, SamplingParams
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
from typing import List
import os
import torch

app = FastAPI()
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = ""  # fine-tuned Llama 3.1 70B
llm = LLM(
    model=model_name,
    dtype=torch.bfloat16,
    gpu_memory_utilization=0.8,
    max_model_len=4096,
    trust_remote_code=True,
    quantization="bitsandbytes",
    load_format="bitsandbytes",
)

SYSTEM_MESSAGE = """
    Act as an ...
"""

class RequestData(BaseModel):
    prompts: List[str]
    max_tokens: int = 2048
    temperature: float = 0.7

@app.post("/predict")
async def generate_text(data: RequestData):
    formatted_prompts = [
        f"<|system|>{SYSTEM_MESSAGE}\n<user>: {prompt}\n<assistant>:" for prompt in data.prompts
    ]
    sampling_params = SamplingParams(
        max_tokens=data.max_tokens,
        temperature=data.temperature,
    )
    
    results = llm.generate(formatted_prompts, sampling_params)
    # RequestOutput objects are not JSON-serializable; return only the generated text
    responses = [output.outputs[0].text for output in results]
    return {"responses": responses}

if __name__ == "__main__":
    uvicorn.run("deployment:app", host="0.0.0.0", port=8000, reload=True)
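
For reference, this is roughly the lifespan-based variant I tried, shown as a simplified sketch: the engine dict is just a placeholder for sharing the model with the endpoint, and the warm-up call at the end is only illustrative.

from contextlib import asynccontextmanager

import torch
from fastapi import FastAPI
from vllm import LLM, SamplingParams

model_name = ""  # same fine-tuned Llama 3.1 70B checkpoint as above
engine = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the model once at startup instead of at import time.
    engine["llm"] = LLM(
        model=model_name,
        dtype=torch.bfloat16,
        gpu_memory_utilization=0.8,
        max_model_len=4096,
        trust_remote_code=True,
        quantization="bitsandbytes",
        load_format="bitsandbytes",
    )
    # Illustrative warm-up generation so any compilation/caching happens before real traffic.
    engine["llm"].generate(["Hello"], SamplingParams(max_tokens=1))
    yield
    engine.clear()

app = FastAPI(lifespan=lifespan)
# The /predict endpoint is the same as above, except it reads engine["llm"]
# instead of the module-level llm.

Even with this, the first response still takes several minutes.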