Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions examples/wide_ep/slurm_scripts/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint

# mtp_size is not read anywhere in the visible part of this script.
# NOTE(review): presumably the multi-token-prediction size, with 0 meaning
# disabled -- confirm against the consumer of this value.
mtp_size=0
# Number of tasks per node; further down it is multiplied by total_node_num
# to produce ntasks.
# NOTE(review): the assignment appears twice here with the same value (4), so
# the second one changes nothing. This looks like the before/after pair of a
# diff -- confirm that only one line is meant to remain. The trailing comment
# suggests the value should be 8 when targeting B200 nodes.
ntasks_per_node=4 # 4 GPUs per GB200 node
ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node

# NOTE(review): presumably the input and output sequence lengths (in tokens)
# used for the benchmark -- their consumer is not visible here; confirm.
isl=1024
osl=1024
Expand All @@ -22,8 +22,9 @@ streaming=true
for b in 1 64 1024; do
for eplb_num_slots in 0 256 288; do
concurrency=$((b * 16))
ctx_num=$(((concurrency + 5499)/5500))
total_node_num=$((ctx_num + 4))
ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
ntasks=$((total_node_num * ntasks_per_node))

args=(
Expand Down Expand Up @@ -56,8 +57,9 @@ done
# dep32 eplb288
for b in 512; do
concurrency=$((b * 32))
ctx_num=$(((concurrency + 5499)/5500))
total_node_num=$((ctx_num + 8))
ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
ntasks=$((total_node_num * ntasks_per_node))
eplb_num_slots=288

Expand Down