Running

Docs: https://lightllm-cn.readthedocs.io/en/latest/getting_started/quickstart.html

Running with PD (prefill/decode) disaggregation

The first example serves Llama-2-7b-chat-hf with tensor parallelism of 2 (two GPUs each for the prefill and decode services):

  1. Start the pd_master

    CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "pd_master" \
    --host 10.119.46.53 \
    --port 60011
    
  2. Start the prefill service

    CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "prefill" \
    --host 10.119.46.53 \
    --port 8017 \
    --tp 2 \
    --nccl_port 2732 \
    --max_total_token_num 200000 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 16000 \
    --running_max_req_size 128 \
    --disable_cudagraph
    
  3. Start the decode service

    CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
    --run_mode "decode" \
    --host 10.119.46.53 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 2 \
    --max_total_token_num 200000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 10.119.46.53 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
    
  4. Test

    curl http://10.119.46.53:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
      "inputs": "What is AI?",
      "parameters": {}
    }'
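
The same request can also be issued from Python. A minimal client sketch using the requests library; the endpoint and payload mirror the curl call above, and "parameters" may carry sampling options such as "max_new_tokens":

    # Minimal Python client for the pd_master /generate endpoint.
    # URL and payload mirror the curl example above.
    import requests

    resp = requests.post(
        "http://10.119.46.53:60011/generate",
        json={
            "inputs": "What is AI?",
            "parameters": {},  # optional sampling options, e.g. {"max_new_tokens": 17}
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json())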
    
The same workflow with Meta-Llama-3.1-8B-Instruct on a single node, using one GPU each for prefill and decode (tp 1):

  1. Start the pd_master

    CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "pd_master" \
    --host 192.168.0.20 \
    --port 60011
    
  2. Start the prefill service

    CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "prefill" \
    --host 192.168.0.20 \
    --port 8017 \
    --tp 1 \
    --nccl_port 2732 \
    --max_total_token_num 20000 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache \
    --max_req_total_len 10000 \
    --running_max_req_size 128 \
    --disable_cudagraph
    
  3. Start the decode service

    CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server \
    --model_dir ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 \
    --run_mode "decode" \
    --host 192.168.0.20 \
    --port 8118 \
    --nccl_port 12322 \
    --tp 1 \
    --max_total_token_num 20000 \
    --graph_max_len_in_batch 2048 \
    --graph_max_batch_size 16 \
    --tokenizer_mode fast \
    --pd_master_ip 192.168.0.20 \
    --pd_master_port 60011 \
    --use_dynamic_prompt_cache
    
  4. Test

    curl http://192.168.0.20:60011/generate \
    -H "Content-Type: application/json" \
    -d '{
      "inputs": "What is AI?",
      "parameters": {
        "max_new_tokens": 17,
        "frequency_penalty": 1
      }
    }'
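
Because prefill and decode run as separate services, it is worth smoke-testing the deployment with a few concurrent requests. A hedged sketch (the URL and payload are taken from the curl call above; the worker count of 8 is an arbitrary choice):

    # Concurrency smoke test: several parallel requests through the pd_master.
    import concurrent.futures

    import requests

    URL = "http://192.168.0.20:60011/generate"
    PAYLOAD = {
        "inputs": "What is AI?",
        "parameters": {"max_new_tokens": 17, "frequency_penalty": 1},
    }

    def one_request(i):
        r = requests.post(URL, json=PAYLOAD, timeout=120)
        r.raise_for_status()
        return i, r.json()

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        for i, body in pool.map(one_request, range(8)):
            print(f"request {i}: {body}")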
    

Starting the model service (standard single-server mode, without PD disaggregation)

python -m lightllm.server.api_server \
--model_dir ~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93/ \
--use_dynamic_prompt_cache
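
To verify this single-process server, POST to its /generate endpoint. A sketch assuming the server's default port (8000 in the quickstart docs linked above; pass --host/--port to the launch command if you bind elsewhere):

    # Quick check against the single-process server; assumes the default
    # port 8000 (per the quickstart docs; pass --port to change it).
    import requests

    r = requests.post(
        "http://127.0.0.1:8000/generate",
        json={"inputs": "What is AI?", "parameters": {"max_new_tokens": 17}},
        timeout=60,
    )
    r.raise_for_status()
    print(r.json())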
