[Misc] Add offline test for disaggregated prefill #12418

Merged: 4 commits, Feb 8, 2025
Changes from 1 commit
examples/offline_inference/disaggregated_prefill.py (80 additions, 0 deletions)
@@ -0,0 +1,80 @@
import os
import time
from multiprocessing import Event, Process

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def run_prefill(prefill_done):
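    # Pin the prefill instance to GPU 0.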
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

prompts = [
"Hello, my name is",
# "Hi, your name is", # To simulate transmission failure
"Tell me a very long story",
]
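    # Only one output token is needed here: this node just runs the prefill
    # pass and hands the resulting KV caches to the decode node.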
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

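    # This instance acts as the KV cache producer (kv_rank 0 in a group of 2),
    # sending KV caches to the decode node via the PyNccl connector.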
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8)

llm.generate(prompts, sampling_params)
print("Prefill node is finished.")
prefill_done.set()

    # Keep the prefill node running in case the decode node has not finished yet.
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("Script stopped by user.")


def run_decode(prefill_done):
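    # Pin the decode instance to GPU 1.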
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

prompts = [
"Hello, my name is",
"Hi, your name is",
"Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)

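    # This instance acts as the KV cache consumer (kv_rank 1 in a group of 2),
    # receiving the prefilled KV caches from the prefill node.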
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8)

    # Wait for the prefill (producer) node to finish before starting to decode.
print("Waiting for prefill node to finish...")
prefill_done.wait()

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
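    # Cross-process event: set by the prefill process once prefill has finished.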
prefill_done = Event()
process_a = Process(target=run_prefill, args=(prefill_done, ))
process_b = Process(target=run_decode, args=(prefill_done, ))

# Start prefill node
process_a.start()

# Start decode node
process_b.start()

process_b.join()
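    # Decode has finished; stop the prefill process, which otherwise runs forever.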
process_a.terminate()