diff --git a/merge_zero3_safetensors.sh b/merge_zero3_safetensors.sh index 3eaaa37..7844f60 100755 --- a/merge_zero3_safetensors.sh +++ b/merge_zero3_safetensors.sh @@ -2,7 +2,7 @@ set -euo pipefail # ===== 可调参数 ===== -CKPT_ROOT="/home/test/checkpoints/q3-32b-ds4" +CKPT_ROOT="/home/test/checkpoints/q3-32b-ds4" # 如果分片实际在 checkpoint-62/global_step62 下,就把这里改成 .../checkpoint-62 TAG="global_step62" HOSTS=(tn01 tn02 tn03 tn04 tn05 tn06) OUT_DIR="${CKPT_ROOT}/merged-${TAG}" @@ -11,64 +11,67 @@ SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout RSYNC_OPTS="-a --info=progress2 --human-readable --partial --inplace" # ==================== -echo "== 预检查 SSH 与远端目录 ==" +echo "== 预检查 SSH 与(非必需)远端目录存在 ==" for h in "${HOSTS[@]}"; do - if ! ssh ${SSH_OPTS} "$h" "true" >/dev/null 2>&1; then - echo "!! 无法免密 SSH 到 $h(检查 ~/.ssh/config/authorized_keys/防火墙)" >&2 - exit 1 - fi - if ! ssh ${SSH_OPTS} "$h" "test -d '${CKPT_ROOT}/${TAG}'"; then - echo "!! $h 上缺少目录 ${CKPT_ROOT}/${TAG},确认训练是否在该机产生了分片" >&2 - fi + ssh ${SSH_OPTS} "$h" "true" >/dev/null || { echo "!! 无法免密 SSH 到 $h"; exit 1; } + # 目录不存在也不致命,后面会跳过 done -echo "== 1/4 开始按节点同步分片(仅 ${TAG},带进度)==" -mkdir -p "${CKPT_ROOT}" +echo "== 1/4 同步各节点的 ${TAG} 整个目录(带进度)==" +mkdir -p "${CKPT_ROOT}/${TAG}" LOCAL_HOST="$(hostname -s || hostname)" for h in "${HOSTS[@]}"; do [[ "$h" == "$LOCAL_HOST" ]] && { echo " - 跳过本机 $h"; continue; } - echo " - 从 $h 拉取 ${CKPT_ROOT}/${TAG}/mp_rank_*/" - # 只拉取该 step 下的 mp_rank_* 目录,避免无关文件 - rsync ${RSYNC_OPTS} -e "ssh ${SSH_OPTS}" \ - --include="${TAG}/" --include="${TAG}/mp_rank_*/" --include="${TAG}/mp_rank_*/**" \ - --exclude="*" \ - "${h}:${CKPT_ROOT}/" "${CKPT_ROOT}/" + if ssh ${SSH_OPTS} "$h" "test -d '${CKPT_ROOT}/${TAG}'"; then + echo " - 从 $h 拉取 ${CKPT_ROOT}/${TAG}/" + # 不做 include/exclude 过滤,避免漏掉不同命名风格的分片文件 + rsync ${RSYNC_OPTS} -e "ssh ${SSH_OPTS}" \ + "${h}:${CKPT_ROOT}/${TAG}/" "${CKPT_ROOT}/${TAG}/" || true + else + echo " - $h 无 ${CKPT_ROOT}/${TAG},跳过" + fi done -echo "== 2/4 校验是否凑齐分片目录 ==" -if [[ ! -d "${CKPT_ROOT}/${TAG}" ]]; then - echo "!! 未发现 ${CKPT_ROOT}/${TAG}" >&2; exit 1 -fi -MP_CNT=$(find "${CKPT_ROOT}/${TAG}" -maxdepth 1 -type d -name "mp_rank_*" | wc -l | tr -d ' ') -echo " - 已发现 mp_rank_* 目录数:${MP_CNT}" -if [[ "${MP_CNT}" -eq 0 ]]; then - echo "!! 没有任何 mp_rank_* 分片,请检查同步" >&2; exit 1 +echo "== 2/4 校验是否有分片“文件”(不是目录)==" +# 兼容两种常见命名:mp_rank_*_model_states.pt 与 *mp_rank*model_states.pt(含 pp 维度) +CNT_A=$(ls -1 "${CKPT_ROOT}/${TAG}"/mp_rank_*_model_states.pt 2>/dev/null | wc -l | tr -d ' ' || true) +CNT_B=$(ls -1 "${CKPT_ROOT}/${TAG}"/*mp_rank*model_states.pt 2>/dev/null | wc -l | tr -d ' ' || true) +CNT=$(( CNT_A + CNT_B )) +echo " - 发现 model_states 分片文件数:${CNT}" +if [[ "${CNT}" -eq 0 ]]; then + echo "!! 未检测到任何 *_model_states.pt;请在各机上 ls 看看 ${CKPT_ROOT}/${TAG} 的实际文件名,再调整匹配规则" >&2 + exit 1 fi echo "== 3/4 合并为 safetensors 输出到:${OUT_DIR} ==" -python - </dev/null for f in config.json generation_config.json tokenizer_config.json tokenizer.json merges.txt vocab.json special_tokens_map.json added_tokens.json; do - [[ -f "$f" ]] && cp -n "$f" "${OUT_DIR}/" + [[ -f "${CKPT_ROOT}/${f}" ]] && cp -n "${CKPT_ROOT}/${f}" "${OUT_DIR}/" done -popd >/dev/null echo "== 4/4 自检(索引与config)==" python - <<'PY'