Running LightLLM
Docs: https://lightllm-cn.readthedocs.io/en/latest/getting_started/quickstart.html
Running with PD (prefill/decode) disaggregation
Run the pd_master
CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "pd_master" \
    --host 10.119.46.53 \
    --port 60011
Run the prefill service
CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "prefill" \
    --host 10.119.46.53 \
    --port 8017 \
    --tp 2 \
    --nccl_port 2732 \
    --max_total_token_num 200000 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 16000 \
    --running_max_req_size 128 \
    --disable_cudagraph
Run the decode service
CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "decode" \
    --host 10.119.46.53 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 2 \
    --max_total_token_num 200000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
Test
curl 10.119.46.53:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
        "inputs": "What is AI?",
        "parameters": {}
    }'
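The same /generate endpoint on the pd_master can also be driven from Python for scripted testing. A minimal sketch using requests; the max_new_tokens value here is illustrative (any of the parameters shown in the curl examples work):

import requests

# Query the pd_master; it routes the request to the registered
# prefill and decode services.
url = "http://10.119.46.53:60011/generate"
payload = {
    "inputs": "What is AI?",
    "parameters": {"max_new_tokens": 64},
}
resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())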
Start the master (second example: Meta-Llama-3.1-8B-Instruct, single GPU per service)
CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "pd_master" \
    --host 192.168.0.20 \
    --port 60011
Start prefill
CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "prefill" \
    --host 192.168.0.20 \
    --port 8017 \
    --tp 1 \
    --nccl_port 2732 \
    --max_total_token_num 20000 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 10000 \
    --running_max_req_size 128 \
    --disable_cudagraph
Start decode
CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "decode" \
    --host 192.168.0.20 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 1 \
    --max_total_token_num 20000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
Test
curl http://192.168.0.20:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
        "inputs": "What is AI?",
        "parameters": {
            "max_new_tokens": 17,
            "frequency_penalty": 1
        }
    }'
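For token-by-token output there is also a streaming route (/generate_stream in the upstream LightLLM docs); a minimal sketch, assuming that route is available on this build, which prints each chunk raw without parsing the stream format:

import requests

# Stream generated tokens from the pd_master as they arrive.
url = "http://192.168.0.20:60011/generate_stream"
payload = {
    "inputs": "What is AI?",
    "parameters": {"max_new_tokens": 17, "frequency_penalty": 1},
}
with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))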
Start the model service (standalone, no PD separation)
python -m lightllm.server.api_server --use_dynamic_prompt_cache --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf
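No --port is passed above, so the server comes up on LightLLM's default port (8000 per the quickstart docs). A quick smoke test in the same style as the PD tests, assuming that default:

import requests

# Smoke-test the standalone server on the assumed default port 8000.
url = "http://localhost:8000/generate"
payload = {"inputs": "What is AI?", "parameters": {"max_new_tokens": 17}}
resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())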