Running

Docs: https://lightllm-cn.readthedocs.io/en/latest/getting_started/quickstart.html

Running with PD (prefill/decode) disaggregation

The first example serves Llama-2-7b-chat-hf with tensor parallelism of 2 (two GPUs each for the prefill and decode services):

  1. Start the pd_master

    CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "pd_master" \
    --host 10.119.46.53 \
    --port 60011
    
  2. Start the prefill service

    CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "prefill" \
    --host 10.119.46.53 \
    --port 8017 \
    --tp 2 \
    --nccl_port 2732 \
    --max_total_token_num 200000 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 16000 \
    --running_max_req_size 128 \
    --disable_cudagraph
    
  3. Start the decode service

    CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "decode" \
    --host 10.119.46.53 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 2 \
    --max_total_token_num 200000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
    
  4. Test

    curl http://10.119.46.53:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
      "inputs": "What is AI?",
      "parameters": {}
    }'
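
The same request can also be issued from Python. A minimal client sketch using the requests library; the endpoint and payload mirror the curl call above, and "parameters" may carry sampling options such as "max_new_tokens":

    # Minimal Python client for the pd_master /generate endpoint.
    # URL and payload mirror the curl example above.
    import requests

    resp = requests.post(
        "http://10.119.46.53:60011/generate",
        json={
            "inputs": "What is AI?",
            "parameters": {},  # optional sampling options, e.g. {"max_new_tokens": 17}
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json())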
    
The same workflow with Meta-Llama-3.1-8B-Instruct on a single node, using one GPU each for prefill and decode (tp 1):

  1. Start the pd_master

    CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "pd_master" \
    --host 192.168.0.20 \
    --port 60011
    
  2. Start the prefill service

    CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "prefill" \
    --host 192.168.0.20 \
    --port 8017 \
    --tp 1 \
    --nccl_port 2732 \
    --max_total_token_num 20000 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 10000 \
    --running_max_req_size 128 \
    --disable_cudagraph
    
  3. Start the decode service

    CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "decode" \
    --host 192.168.0.20 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 1 \
    --max_total_token_num 20000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
    
  4. Test

    curl http://192.168.0.20:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
      "inputs": "What is AI?",
      "parameters": {
        "max_new_tokens": 17,
        "frequency_penalty": 1
      }
    }'
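
Because prefill and decode run as separate services, it is worth smoke-testing the deployment with a few concurrent requests. A hedged sketch (the URL and payload are taken from the curl call above; the worker count of 8 is an arbitrary choice):

    # Concurrency smoke test: several parallel requests through the pd_master.
    import concurrent.futures

    import requests

    URL = "http://192.168.0.20:60011/generate"
    PAYLOAD = {
        "inputs": "What is AI?",
        "parameters": {"max_new_tokens": 17, "frequency_penalty": 1},
    }

    def one_request(i):
        r = requests.post(URL, json=PAYLOAD, timeout=120)
        r.raise_for_status()
        return i, r.json()

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        for i, body in pool.map(one_request, range(8)):
            print(f"request {i}: {body}")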
    

Starting the model service (standard single-server mode, without PD disaggregation)

python -m lightllm.server.api_server \
--model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
--use_dynamic_prompt_cache
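
To verify this single-process server, POST to its /generate endpoint. A sketch assuming the server's default port (8000 in the quickstart docs linked above; pass --host/--port to the launch command if you bind elsewhere):

    # Quick check against the single-process server; assumes the default
    # port 8000 (per the quickstart docs; pass --port to change it).
    import requests

    r = requests.post(
        "http://127.0.0.1:8000/generate",
        json={"inputs": "What is AI?", "parameters": {"max_new_tokens": 17}},
        timeout=60,
    )
    r.raise_for_status()
    print(r.json())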
