diff --git a/.agents/skills b/.agents/skills new file mode 120000 index 00000000000..42c5394a18a --- /dev/null +++ b/.agents/skills @@ -0,0 +1 @@ +../skills \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000000..8749647f9ac --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,14 @@ +{ + "hooks": { + "UserPromptSubmit": [ + { + "hooks": [ + { + "type": "command", + "command": "printf '{\"hookSpecificOutput\":{\"hookEventName\":\"UserPromptSubmit\",\"additionalContext\":\"MANDATORY WORKFLOW — never skip or reorder: (1) Read the artifact first (commit, file, error, PR). (2) Identify and invoke the relevant skill via the Skill tool BEFORE forming any answer or plan — even when the answer seems obvious. (3) Only then answer using the skill context. Skipping step 2 is not allowed.\"}}'" + } + ] + } + ] + } +} diff --git a/.claude/skills b/.claude/skills new file mode 120000 index 00000000000..42c5394a18a --- /dev/null +++ b/.claude/skills @@ -0,0 +1 @@ +../skills \ No newline at end of file diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 00000000000..160bda5f0f6 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,39 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +language: "en-US" + +# Only comment on Critical/Major bugs. No Minor, Trivial, or style comments. +tone_instructions: "Only comment on Critical or Major bugs. Never comment on Minor issues, style, refactoring, or suggestions. When in doubt, stay silent." + +reviews: + # Use chill profile - filters out nitpicks automatically + profile: "chill" + + # Disable all summary features + high_level_summary: false + high_level_summary_in_walkthrough: false + + # Disable walkthrough comment entirely + collapse_walkthrough: true + changed_files_summary: false + sequence_diagrams: false + + # Disable status/effort estimates + review_status: false + commit_status: false + estimate_code_review_effort: false + + # Disable auto-suggestions for labels/reviewers + suggested_labels: false + suggested_reviewers: false + + # Disable related issues/PRs lookup + assess_linked_issues: false + related_issues: false + related_prs: false + + # Auto-review disabled - only review when explicitly requested via @coderabbitai review + auto_review: + enabled: false + +chat: + auto_reply: true diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 29de6ff8a38..00000000000 --- a/.coveragerc +++ /dev/null @@ -1,5 +0,0 @@ -[html] -directory = coverage - -[run] -data_file = .coverage_$LOCAL_RANK diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 00000000000..6297a7a53f4 --- /dev/null +++ b/.cursorrules @@ -0,0 +1 @@ +See CLAUDE.md for all repository guidelines. 
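The `UserPromptSubmit` hook added in `.claude/settings.json` above works by printing a JSON payload to stdout, which the hook runner merges into the prompt context. Below is a minimal sanity-check sketch for that payload, assuming the hook contract is a top-level `hookSpecificOutput` object with `hookEventName` and `additionalContext`; the script is illustrative and not part of this diff:

```python
import json
import subprocess

# Load the hook command exactly as configured in .claude/settings.json above.
with open(".claude/settings.json") as f:
    settings = json.load(f)
hook_cmd = settings["hooks"]["UserPromptSubmit"][0]["hooks"][0]["command"]

# Run it and check that the emitted payload is valid JSON of the expected shape.
out = subprocess.run(hook_cmd, shell=True, capture_output=True, text=True, check=True)
payload = json.loads(out.stdout)  # raises if the single-line printf string is malformed

inner = payload["hookSpecificOutput"]
assert inner["hookEventName"] == "UserPromptSubmit"
assert inner["additionalContext"].startswith("MANDATORY WORKFLOW")
print("hook payload OK")
```

A check like this catches quoting mistakes in the one-line `printf` before they silently break the hook.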
diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000000..1e35e0c496b --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +max-line-length = 100 +extend-ignore = E203,E501,F401,E402,E714 +per-file-ignores = __init__.py:F401 \ No newline at end of file diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000000..abfd15e56b5 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,67 @@ +megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo + +megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt + +megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal + +megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba +megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba + +megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model + +megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets + +megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers + +megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp + +megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp + +megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing + +megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer + +megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference + +megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism + +megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/transformer + +megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech + +megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference + +megatron/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference-interface + +megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo + +megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training + +megatron/post_training/ @NVIDIA/post-training + +megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs + +megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo +megatron/training/arguments.py + +.gitlab/ @NVIDIA/ci +.github/ @NVIDIA/ci +.github/oncall_schedule.json @NVIDIA/mcore-oncall-rotation +.gitlab-ci.yml @NVIDIA/ci +docker/ @NVIDIA/ci +tests/functional_tests/python_test_utils/ @NVIDIA/ci +tests/functional_tests/shell_test_utils/ @NVIDIA/ci +tests/test_utils/recipes/ @NVIDIA/ci +tests/unit_tests/run_ci_test.sh @NVIDIA/ci + +# API Backwards Compatibility Check +scripts/check_api_backwards_compatibility.py @NVIDIA/ci +scripts/README_API_COMPAT.md @NVIDIA/ci +.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci +docs/api-backwards-compatibility-check.md @NVIDIA/ci +tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci + +megatron/rl/ @NVIDIA/reinforcement-learning +examples/rl/ @NVIDIA/reinforcement-learning +test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning +train_rl.py @NVIDIA/reinforcement-learning diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 
100644 index 00000000000..9662160da10 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,29 @@ +--- +name: Bug report +about: Create a report to help us improve the repository or project +title: "" +labels: bug +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) +to get the oncall's attention on this issue. + +**Steps/Code to reproduce bug** + +Please list the *minimal* steps or a code snippet that allows us to reproduce the bug. + +A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports. + + +**Expected behavior** + +A clear and concise description of what you expected to happen. + + +**Additional context** + +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000000..99d680b0ab4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,2 @@ +blank_issues_enabled: false + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000000..b0da6789a8e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,23 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "" +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) +to get the oncall's attention on this issue. + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 00000000000..899ff44d6a6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,13 @@ +--- +name: QUESTION +about: Ask a question about Megatron-LM that is not a bug, regression or enhancement + request +title: "[QUESTION]" +labels: '' +assignees: '' + +--- + +**Your question** +Ask a clear and concise question about Megatron-LM. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) +to get the oncall's attention on this issue. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/regression.md b/.github/ISSUE_TEMPLATE/regression.md new file mode 100644 index 00000000000..180db633cb8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/regression.md @@ -0,0 +1,40 @@ +--- +name: REGRESSION +about: Report a regression in speed or accuracy due to a Megatron-LM update +title: "[REGRESSION]" +labels: '' +assignees: '' + +--- + +**Describe the regression** +A clear and concise description of what the regression is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) +to get the oncall's attention on this issue. + +**To Reproduce** +Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention. + +**Previous performance** +What speed or accuracy did you previously see? + +**New performance** +What speed or accuracy do you see after the update? 
+ +**Stack trace/logs** +If applicable, add the stack trace or logs related to the regression. + +**Environment (please complete the following information):** + - Previous Megatron-LM commit ID + - New Megatron-LM commit ID + - Previous PyTorch version + - New PyTorch version + - Previous CUDA version + - New CUDA version + - Previous NCCL version + - New NCCL version + +**Proposed fix** +If you have a proposal for how to fix the issue, state it here or link to a PR. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/actions/action.yml b/.github/actions/action.yml new file mode 100644 index 00000000000..6ca5e5828ec --- /dev/null +++ b/.github/actions/action.yml @@ -0,0 +1,294 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: "Test Template" +description: "Template for running NeMo tests in a containerized environment" + +inputs: + container-image: + description: "Container image to use for test" + required: true + timeout: + description: "Max runtime of test in minutes" + required: false + default: "30" + script: + description: "Test script to execute" + required: true + is-optional: + description: "Treat the job as optional: report success even if the test fails." + required: false + default: "false" + is_unit_test: + description: "Upload coverage as unit test" + required: false + default: "false" + tag: + description: Latest or legacy test suite + required: true + test_case: + description: Test case to launch + required: true + model: + description: Model to launch + required: false + PAT: + description: "GitHub Personal Access Token" + required: true + scope: + description: "Test scope (e.g. mr-github, mr-github-slim)" + required: false + default: "mr-github-slim" + n_repeat: + description: "Number of test repetitions" + required: false + default: "5" + lightweight: + description: "Enable lightweight mode" + required: false + default: "false" + platform: + description: "Platform to run tests on (e.g. dgx_h100, dgx_gb200)" + required: false + default: "dgx_h100" + cadence: + description: "Trigger cadence for cadence filter (pr|nightly|mergegroup). Empty disables filter." + required: false + default: "" +runs: + using: "composite" + steps: + - name: Print node name + shell: bash -x -e -u -o pipefail {0} + run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT" + + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Change ownership of /home/runner/ + shell: bash + run: sudo chown -R $(whoami) /home/runner/ + + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install uuid-runtime + shell: bash -x -e -u -o pipefail {0} + run: | + for i in 1 2 3; do + apt-get update && apt-get install -y uuid-runtime && break + echo "apt attempt $i failed, retrying..." 
+ sleep 10 + done + + - name: Install uv + shell: bash -x -e -u -o pipefail {0} + run: | + for i in 1 2 3; do + curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/local/bin sh && break + echo "uv install attempt $i failed, retrying..." + sleep 10 + done + + - name: Create run-script (unit test) + shell: bash -x -e -u -o pipefail {0} + if: inputs.is_unit_test == 'true' + run: | + echo "::group::Create run-script" + cmd=$(cat <<'RUN_TEST_EOF' + #!/bin/bash + + export PYTHONPATH=$(pwd) + export NEMORUN_HOME=$(pwd) + export NCCL_DEBUG=INFO + uv venv .venv + uv cache clean + uv sync --no-cache --only-group test + uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ + --scope unit-tests \ + --model unit-tests \ + --test-case "${{ inputs.test_case }}" \ + --environment dev \ + --platform ${{ inputs.platform }} \ + --tag ${{ inputs.tag }} \ + --container-image ${{ inputs.container-image }} \ + --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME + + RUN_TEST_EOF + ) + echo "$cmd" | tee "job.sh" + echo "::endgroup::" + + - name: Create run-script (e2e test) + shell: bash -x -e -u -o pipefail {0} + if: inputs.is_unit_test == 'false' + env: + MODEL: ${{ inputs.model }} + run: | + echo "::group::Create run-script" + cmd=$(cat <<'RUN_TEST_EOF' + #!/bin/bash + set -euxo pipefail + + ARGS=( + --scope ${{ inputs.scope }} + --n-repeat ${{ inputs.n_repeat }} + ) + if [ "${{ inputs.lightweight }}" == "true" ]; then + ARGS+=(--enable-lightweight-mode) + fi + if [ -n "${{ inputs.cadence }}" ]; then + ARGS+=(--cadence ${{ inputs.cadence }}) + fi + + export PYTHONPATH=$(pwd) + export NEMORUN_HOME=$(pwd) + uv venv .venv + uv cache clean + uv sync --no-cache --only-group test + uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \ + ${ARGS[@]} \ + --model ${{ inputs.model }} \ + --test-case ${{ inputs.test_case }} \ + --environment dev \ + --platform ${{ inputs.platform }} \ + --container-image ${{ inputs.container-image }} \ + --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \ + --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME + + RUN_TEST_EOF + ) + echo "$cmd" | tee "job.sh" + echo "::endgroup::" + + - name: Set timeout + shell: bash -x -e -u -o pipefail {0} + id: timeout_in_seconds + run: echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT" + + - name: Pull container + shell: bash -x -e -u -o pipefail {0} + run: | + echo "::group::Pull container" + docker pull ${{ inputs.container-image }} + echo "::endgroup::" + + - name: Run main script + shell: bash -x -e -u -o pipefail {0} + id: run-main-script + run: | + { set +x; } 2>/dev/null + echo -e "\033[1;34m┌─ launching test ─────────────────────────────────────────────────────────┐\033[0m" + echo -e "\033[1;34m│ test case : ${{ inputs.test_case }}\033[0m" + echo -e "\033[1;34m│ platform : ${{ inputs.platform }} scope: ${{ inputs.scope }}\033[0m" + echo -e "\033[1;34m│ container : ${{ inputs.container-image }}\033[0m" + echo -e "\033[1;34m└──────────────────────────────────────────────────────────────────────────┘\033[0m" + { set -x; } 2>/dev/null + echo "::group::Logs" + EXIT_CODE=0 + /bin/bash job.sh || EXIT_CODE=$? 
+ echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT" + echo "::endgroup::" + exit $EXIT_CODE + + - name: Check result + id: check + shell: bash -e -u -o pipefail {0} + if: always() + env: + IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }} + MAIN_CONCLUSION: ${{ steps.run-main-script.conclusion }} + MAIN_EXIT_CODE: ${{ steps.run-main-script.outputs.exit_code }} + run: | + logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(cat /proc/sys/kernel/random/uuid) + echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT" + if [[ "$IS_UNIT_TEST" == "true" ]]; then + coverage_report=coverage-unit-test-${{ github.run_id }}-$(cat /proc/sys/kernel/random/uuid) + else + coverage_report=none + fi + echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT" + + EXIT_CODE="${MAIN_EXIT_CODE:-${MAIN_CONCLUSION}}" + if [[ "$MAIN_CONCLUSION" == "success" ]]; then + IS_SUCCESS=true + else + IS_SUCCESS=false + fi + + if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is-optional }}" == "true" ]]; then + echo "::warning::Test failed but is marked optional — treating as success." + IS_SUCCESS=true + fi + + LOG_BASE=$([[ "$IS_UNIT_TEST" == "true" ]] && echo "assets_dir/logs" || echo "assets_dir") + LATEST_LOG="" + if [[ -d "$LOG_BASE" ]]; then + LATEST_LOG=$(find "$LOG_BASE" -name "*.log" ! -name "nccl_debug.log" -type f 2>/dev/null \ + | xargs -r ls -t 2>/dev/null | head -1 || true) + fi + if [[ -n "$LATEST_LOG" ]]; then + echo -e "\033[1;36m\n📋 ── log excerpt ───────────────────────────────────────────────────────\033[0m" + echo -e "\033[1;36m ${LATEST_LOG} — last 40 lines\033[0m" + echo -e "\033[1;36m────────────────────────────────────────────────────────────────────────\033[0m" + tail -40 "$LATEST_LOG" + echo -e "\033[1;36m────────────────────────────────────────────────────────────────────────\033[0m\n" + else + echo -e "\033[33m⚠ no log file found in ${LOG_BASE}\033[0m" + fi + + if [[ "$coverage_report" != "none" ]]; then + echo "::group::Coverage report" + uv run coverage report -i || true + echo "::endgroup::" + fi + + if [[ "$IS_SUCCESS" == "true" ]]; then + echo -e "\033[1;32m╔══════════════════════════════════════════════════════════════════════════╗\033[0m" + echo -e "\033[1;32m║ ║\033[0m" + echo -e "\033[1;32m║ ✅ PASSED ║\033[0m" + echo -e "\033[1;32m║ ${{ inputs.test_case }}\033[0m" + echo -e "\033[1;32m║ ║\033[0m" + echo -e "\033[1;32m╚══════════════════════════════════════════════════════════════════════════╝\033[0m" + echo "::notice title=Result::✅ ${{ inputs.test_case }} — PASSED" + exit 0 + else + echo -e "\033[1;31m╔══════════════════════════════════════════════════════════════════════════╗\033[0m" + echo -e "\033[1;31m║ ║\033[0m" + echo -e "\033[1;31m║ ❌ FAILED (exit code: ${EXIT_CODE}) ║\033[0m" + echo -e "\033[1;31m║ ${{ inputs.test_case }}\033[0m" + echo -e "\033[1;31m║ ║\033[0m" + echo -e "\033[1;31m╚══════════════════════════════════════════════════════════════════════════╝\033[0m" + echo "::error title=Result::❌ ${{ inputs.test_case }} — FAILED (exit $EXIT_CODE)" + exit 1 + fi + + - name: Upload coverage + uses: actions/upload-artifact@v6 + if: ${{ always() && steps.check.outputs.coverage_report != 'none' }} + with: + name: ${{ steps.check.outputs.coverage_report }} + path: | + coverage.xml + .coverage + include-hidden-files: true + + - name: Upload logs + uses: actions/upload-artifact@v6 + if: always() + with: + name: ${{ steps.check.outputs.logs_report }} + path: ${{ inputs.is_unit_test == 'true' && 'assets_dir/logs' || 'assets_dir' 
}} + include-hidden-files: true diff --git a/.github/actions/check-nvidia-sso-membership/action.yml b/.github/actions/check-nvidia-sso-membership/action.yml new file mode 100644 index 00000000000..71926c4547d --- /dev/null +++ b/.github/actions/check-nvidia-sso-membership/action.yml @@ -0,0 +1,139 @@ +name: 'Check NVIDIA SSO Membership' +description: 'Check if a GitHub username exists in the NVIDIA SSO users list from github-audits' +author: 'NVIDIA' + +inputs: + username: + description: 'GitHub username to check' + required: true + github_audits_repo: + description: 'Repository containing SSO users file' + required: false + default: 'NVIDIA-GitHub-Management/github-audits' + github_audits_version: + description: 'Release version tag' + required: false + default: 'v0.1.0' + sso_users_filename: + description: 'Filename of SSO users JSON' + required: false + default: 'users_sso.json' + github_token: + description: 'GitHub token with access to github-audits repo' + required: true + +outputs: + is_member: + description: 'Boolean - true if user is in NVIDIA SSO list, false otherwise' + value: ${{ steps.check-membership.outputs.is_member }} + is_org_member: + description: 'Boolean - true if user has NVIDIA or NVIDIA-NeMo in org_roles' + value: ${{ steps.check-membership.outputs.is_org_member }} + user_orgs: + description: 'Comma-separated list of orgs user is member of' + value: ${{ steps.check-membership.outputs.user_orgs }} + sso_file_available: + description: 'Boolean - true if SSO file was successfully downloaded' + value: ${{ steps.download-sso.outputs.sso_file_available }} + user_count: + description: 'Number of users in the SSO file (0 if download failed)' + value: ${{ steps.download-sso.outputs.user_count }} + +runs: + using: 'composite' + steps: + - name: Download NVIDIA SSO users from github-audits + id: download-sso + shell: bash + env: + GH_TOKEN: ${{ inputs.github_token }} + run: | + echo "Downloading ${{ inputs.sso_users_filename }} from ${{ inputs.github_audits_repo }} ${{ inputs.github_audits_version }} release..." + + # Download the release asset using gh CLI + gh release download ${{ inputs.github_audits_version }} \ + --repo ${{ inputs.github_audits_repo }} \ + --pattern ${{ inputs.sso_users_filename }} \ + --clobber 2>&1 || { + echo "ERROR: Failed to download ${{ inputs.sso_users_filename }} from github-audits release" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + } + + # Verify file was downloaded and is valid JSON + if [ ! -f ${{ inputs.sso_users_filename }} ]; then + echo "ERROR: ${{ inputs.sso_users_filename }} file not found after download" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + fi + + # Validate JSON structure + if ! 
jq -e 'type == "object"' ${{ inputs.sso_users_filename }} > /dev/null 2>&1; then + echo "ERROR: ${{ inputs.sso_users_filename }} is not a valid JSON object" + echo "sso_file_available=false" >> $GITHUB_OUTPUT + echo "user_count=0" >> $GITHUB_OUTPUT + exit 0 + fi + + USER_COUNT=$(jq 'length' ${{ inputs.sso_users_filename }}) + echo "Successfully downloaded ${{ inputs.sso_users_filename }} with $USER_COUNT NVIDIA SSO users" + echo "sso_file_available=true" >> $GITHUB_OUTPUT + echo "user_count=$USER_COUNT" >> $GITHUB_OUTPUT + + - name: Check if user is in SSO list + id: check-membership + shell: bash + run: | + USERNAME="${{ inputs.username }}" + SSO_FILE="${{ inputs.sso_users_filename }}" + + echo "Checking if $USERNAME is in NVIDIA SSO users list..." + + # Check if SSO file is available + if [ "${{ steps.download-sso.outputs.sso_file_available }}" != "true" ] || [ ! -f "$SSO_FILE" ]; then + echo "ERROR: $SSO_FILE not available - cannot check membership" + echo "is_member=false" >> $GITHUB_OUTPUT + echo "is_org_member=false" >> $GITHUB_OUTPUT + echo "user_orgs=" >> $GITHUB_OUTPUT + exit 0 + fi + + # Check if username exists as a key in the JSON object + if jq -e --arg user "$USERNAME" 'has($user)' "$SSO_FILE" > /dev/null 2>&1; then + echo "$USERNAME found in NVIDIA SSO users" + echo "is_member=true" >> $GITHUB_OUTPUT + + # Extract and check org membership + IS_ORG_MEMBER=$(jq -r --arg user "$USERNAME" ' + .[$user].org_roles // [] | + map(select(test("^(NVIDIA|NVIDIA-NeMo):Member$"))) | + length > 0 + ' "$SSO_FILE") + + USER_ORGS=$(jq -r --arg user "$USERNAME" ' + .[$user].org_roles // [] | + map(split(":")[0]) | + unique | + join(",") + ' "$SSO_FILE") + + echo "is_org_member=$IS_ORG_MEMBER" >> $GITHUB_OUTPUT + echo "user_orgs=$USER_ORGS" >> $GITHUB_OUTPUT + + if [ "$IS_ORG_MEMBER" == "true" ]; then + echo "$USERNAME is a member of NVIDIA or NVIDIA-NeMo org" + else + echo "$USERNAME has @nvidia.com email but is not in NVIDIA or NVIDIA-NeMo org (orgs: $USER_ORGS)" + fi + else + echo "$USERNAME NOT found in NVIDIA SSO users" + echo "is_member=false" >> $GITHUB_OUTPUT + echo "is_org_member=false" >> $GITHUB_OUTPUT + echo "user_orgs=" >> $GITHUB_OUTPUT + fi + +branding: + icon: 'shield' + color: 'green' diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 00000000000..c4f81fbcb7d --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +enabled: true +auto_sync_draft: false +auto_sync_ready: true +trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "Connor-XY", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Mellonta", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "WanZzzzzz", "Wohox", "YangFei1990", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "aroshanghias-nvd", "asolergi-nv", "balasaajay", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "fitsumreda", "frsun-nvda", "gautham-kollu", "gdengk", "guihong-nv", "guyueh1", "hexinw-nvidia", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "minitu", "mkhona-nvidia", "nanz-nv", "ntajbakhsh", 
"parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "sheliang-nv", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sraman-rgb", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "wujingyue", "xiaoyao0115", "xuantengh", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"] diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json new file mode 100644 index 00000000000..2f6e01c786c --- /dev/null +++ b/.github/oncall_schedule.json @@ -0,0 +1,50 @@ +[ + { + "user": "janEbert", + "date": "2026-05-06" + }, + { + "user": "dimapihtar", + "date": "2026-05-13" + }, + { + "user": "ilml", + "date": "2026-05-20" + }, + { + "user": "wujingyue", + "date": "2026-05-27" + }, + { + "user": "Connor-XY", + "date": "2026-06-03" + }, + { + "user": "guihong-nv", + "date": "2026-06-10" + }, + { + "user": "Phlip79", + "date": "2026-06-17" + }, + { + "user": "asolergi-nv", + "date": "2026-06-24" + }, + { + "user": "maanug-nv", + "date": "2026-07-01" + }, + { + "user": "wujingyue", + "date": "2026-07-08" + }, + { + "user": "Connor-XY", + "date": "2026-07-15" + }, + { + "user": "Phlip79", + "date": "2026-07-22" + } +] diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000000..8f319e66f87 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,59 @@ +# What does this PR do ? + + +:warning: For major changes (either in lines of code or in its impact), please make sure to first share a design doc with the team. If you're unsure what's the best way to do so, contact the @mcore-oncall. + +## Issue tracking + +For PRs from open-source community contributors: + +- **New features**: a linked issue is **required**. Please open a [feature request](https://github.com/NVIDIA/Megatron-LM/issues/new?template=feature_request.md) and reference it here before submitting the PR. +- **Small updates (bug fixes, minor improvements)**: a linked issue is **recommended** and will accelerate the PR review process. + +Linked issue: + +## Contribution process + +### Pre-checks + +- [ ] I have added relevant unit tests +- [ ] I have added relevant functional tests +- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html) +- [ ] I have added relevant documentation +- [ ] I have run the [autoformatter.sh](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/autoformat.sh) on my PR + +### Code review + +Feel free to message or comment the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged! + +All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft. + +#### Step 1: Mark PR as "Ready for Review" + +1. When your PR is ready, click **Ready for Review**. +2. An oncall reviewer is auto-assigned and expert reviewers are notified based on your changes. + - Some PRs may jump straight to step 2. This is determined by `.github/CODEOWNERS`. + +:warning: Only mark as ready once merge-conflicts are resolved and the CI is passing. +Final Review might get declined if these requirements are not fulfilled. 
+ +#### Step 2: Final Review + +For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned. + +For PRs outside `megatron/core`, this step is skipped. + +#### Step 3: Approved + +Once all required reviewers have approved, the `Approved` label is applied **automatically**. + +### Merge + +Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR. + +
+For MRs into the `dev` branch +The proposed review process for the `dev` branch is under active discussion. + +MRs are mergeable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`. +
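The `oncall_manager.py` script that follows rotates `.github/oncall_schedule.json` (added earlier in this diff) on the assumption that each entry is a shift starting on a Wednesday and lasting exactly one week. Here is a small, illustrative sketch that validates that invariant; only the file path is taken from the diff:

```python
import json
from datetime import date, timedelta

# Check the invariant oncall_manager.py relies on: weekly Wednesday shifts.
with open(".github/oncall_schedule.json") as f:
    schedule = json.load(f)

prev = None
for entry in schedule:
    start = date.fromisoformat(entry["date"])
    # weekday() == 2 is Wednesday, matching get_last_wednesday() in the script below.
    assert start.weekday() == 2, f"{entry['user']} does not start on a Wednesday"
    if prev is not None:
        assert start - prev == timedelta(days=7), f"gap before {start} is not one week"
    prev = start

print(f"schedule OK: {len(schedule)} weekly shifts; {schedule[0]['user']} is up first")
```

Repeated names in the schedule are expected: with fewer rotation members than `TARGET_WEEKS`, the rotation wraps around.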
diff --git a/.github/scripts/oncall_manager.py b/.github/scripts/oncall_manager.py new file mode 100644 index 00000000000..332fcb1c8cc --- /dev/null +++ b/.github/scripts/oncall_manager.py @@ -0,0 +1,439 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import json +import requests +import argparse +from datetime import datetime, timedelta, timezone + +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +# Constants +GITHUB_API_URL = "https://api.github.com" +SCHEDULE_FILE = ".github/oncall_schedule.json" +ROTATION_TEAM_SLUG = "mcore-oncall-rotation" +ACTIVE_ONCALL_TEAM_SLUG = "mcore-oncall" +SLACK_USERGROUP_HANDLE = "mcore-oncall" +TARGET_WEEKS = 12 + +# Caches for email and Slack lookups +_email_cache = {} +_slack_id_cache = {} + +def get_headers(): + token = os.environ.get("GH_TOKEN") + if not token: + # Fallback to GITHUB_TOKEN if GH_TOKEN not set + token = os.environ.get("GITHUB_TOKEN") + + if not token: + print("Error: GH_TOKEN or GITHUB_TOKEN not set") + sys.exit(1) + + return { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json" + } + +def get_repo_info(): + """Returns (owner, repo) from GITHUB_REPOSITORY env var.""" + repo_env = os.environ.get("GITHUB_REPOSITORY") + if not repo_env: + print("Error: GITHUB_REPOSITORY environment variable not set") + sys.exit(1) + parts = repo_env.split("/") + return parts[0], parts[1] + +def get_team_members(org, team_slug): + """Fetches members of the GitHub team.""" + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members" + headers = get_headers() + + members = set() + page = 1 + while True: + resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers) + if resp.status_code != 200: + print(f"Error fetching team members: {resp.status_code} {resp.text}") + sys.exit(1) + + data = resp.json() + if not data: + break + + members.update([m['login'] for m in data]) + if len(data) < 100: + break + page += 1 + + return members + +def get_user_email(username): + """Get user's email from GitHub, prioritizing @nvidia.com emails. + + Checks in order: + 1. Public profile email + 2. Recent commits in the repository + """ + if username in _email_cache: + return _email_cache[username] + + headers = get_headers() + public_email = None + + try: + # 1. Try to get user's public profile email first + resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers) + if resp.status_code == 200: + user_data = resp.json() + email = user_data.get('email') + if email and not email.endswith("@users.noreply.github.com"): + if email.endswith("@nvidia.com"): + _email_cache[username] = email + return email + # Store non-nvidia email as fallback + public_email = email + + # 2. 
Check recent commits in the repository for @nvidia.com email + repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM") + commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10" + resp = requests.get(commits_url, headers=headers) + + if resp.status_code == 200: + commits = resp.json() + for commit in commits: + # Get email from commit author + commit_data = commit.get('commit', {}) + author_data = commit_data.get('author', {}) + email = author_data.get('email') + + if email and not email.endswith("@users.noreply.github.com"): + if email.endswith("@nvidia.com"): + _email_cache[username] = email + print(f"Found @nvidia.com email for {username} from commits: {email}") + return email + elif public_email is None: + public_email = email + + # 3. Use public email if found, otherwise fallback + if public_email: + _email_cache[username] = public_email + print(f"Using public email for {username}: {public_email}") + return public_email + + # Fallback to noreply email + fallback = f"{username}@users.noreply.github.com" + _email_cache[username] = fallback + print(f"Warning: No email found for {username}, using fallback: {fallback}") + return fallback + + except Exception as e: + print(f"Warning: Could not get email for {username}: {e}") + fallback = f"{username}@users.noreply.github.com" + _email_cache[username] = fallback + return fallback + +def get_slack_client(): + """Get Slack WebClient if token is available.""" + slack_token = os.environ.get("SLACK_TOKEN") + if not slack_token: + return None + + return WebClient(token=slack_token) + +def get_slack_user_id(slack_client, email): + """Get Slack user ID from email.""" + if not slack_client: + return None + + if email in _slack_id_cache: + return _slack_id_cache[email] + + try: + response = slack_client.users_lookupByEmail(email=email) + user_id = response["user"]["id"] + _slack_id_cache[email] = user_id + return user_id + except SlackApiError as e: + print(f"Warning: Could not find Slack user for {email}: {e.response['error']}") + _slack_id_cache[email] = None + return None + +def get_slack_usergroup_id(slack_client, handle): + """Get Slack usergroup ID from handle.""" + if not slack_client: + return None + + try: + response = slack_client.usergroups_list(include_users=True) + for usergroup in response.get("usergroups", []): + if usergroup.get("handle") == handle: + return usergroup.get("id"), usergroup.get("users", []) + print(f"Warning: Slack usergroup '{handle}' not found") + return None, [] + except SlackApiError as e: + print(f"Warning: Could not list Slack usergroups: {e.response['error']}") + return None, [] + +def update_slack_usergroup(new_oncall_username, old_members_usernames): + """ + Updates the Slack usergroup to contain only the new oncall user. + Adds new oncall first, then removes old members (usergroups need at least one member). 
+ """ + slack_client = get_slack_client() + if not slack_client: + print("Slack token not configured, skipping Slack usergroup update") + return + + # Get the new oncall's email and Slack user ID + new_email = get_user_email(new_oncall_username) + new_slack_id = get_slack_user_id(slack_client, new_email) + + if not new_slack_id: + print(f"Could not find Slack user ID for {new_oncall_username} ({new_email}), skipping Slack update") + return + + # Get the usergroup ID and current members + usergroup_id, current_slack_members = get_slack_usergroup_id(slack_client, SLACK_USERGROUP_HANDLE) + + if not usergroup_id: + print(f"Could not find Slack usergroup '{SLACK_USERGROUP_HANDLE}', skipping Slack update") + return + + try: + # Step 1: Add new oncall first (include current members to avoid removing anyone yet) + # This ensures usergroup always has at least one member + if new_slack_id not in current_slack_members: + updated_members = list(set(current_slack_members + [new_slack_id])) + slack_client.usergroups_users_update( + usergroup=usergroup_id, + users=updated_members + ) + print(f"Added {new_oncall_username} to Slack usergroup '{SLACK_USERGROUP_HANDLE}'") + + # Step 2: Now set the usergroup to contain only the new oncall + slack_client.usergroups_users_update( + usergroup=usergroup_id, + users=[new_slack_id] + ) + print(f"Updated Slack usergroup '{SLACK_USERGROUP_HANDLE}' to contain only {new_oncall_username}") + + except SlackApiError as e: + print(f"Failed to update Slack usergroup: {e.response['error']}") + +def load_schedule(): + if not os.path.exists(SCHEDULE_FILE): + return [] + try: + with open(SCHEDULE_FILE, 'r') as f: + data = json.load(f) + # Normalize to list of dicts if it's a list of strings + schedule = [] + for item in data: + if isinstance(item, str): + schedule.append({"user": item, "date": "YYYY-MM-DD"}) + else: + schedule.append(item) + return schedule + except (json.JSONDecodeError, FileNotFoundError): + return [] + +def save_schedule(schedule): + with open(SCHEDULE_FILE, 'w') as f: + json.dump(schedule, f, indent=4) + f.write('\n') # trailing newline + +def update_active_oncall_team(org, new_oncall): + """Updates the active oncall team to contain only the new oncall user.""" + # 1. Get current members of the active team + current_members = get_team_members(org, ACTIVE_ONCALL_TEAM_SLUG) + + # 2. Add the new oncall if not present + if new_oncall not in current_members: + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{new_oncall}" + resp = requests.put(url, headers=get_headers()) + if resp.status_code == 200: + print(f"Added {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}") + else: + print(f"Failed to add {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}") + + # 3. Remove everyone else + old_members = [] + for member in current_members: + if member not in [new_oncall, 'svcnvidia-nemo-ci']: + old_members.append(member) + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{member}" + resp = requests.delete(url, headers=get_headers()) + if resp.status_code == 204: + print(f"Removed {member} from {ACTIVE_ONCALL_TEAM_SLUG}") + else: + print(f"Failed to remove {member} from {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}") + + # 4. Update Slack usergroup (add new oncall first, then remove old members) + update_slack_usergroup(new_oncall, old_members) + +def rotate_schedule(repo_owner, dry_run=False): + schedule = load_schedule() + print(f"Current schedule length: {len(schedule)}") + + # 1. 
Rotate (Remove past week) + # Only if schedule is not empty. + if schedule: + # Check date of first entry + first_entry = schedule[0] + try: + # We assume the date is the *start* of the oncall shift (Wednesday). + # The shift ends 7 days later. + start_date = datetime.strptime(first_entry['date'], "%Y-%m-%d").date() + end_date = start_date + timedelta(days=7) + + today = datetime.now(timezone.utc).date() + + # If today is >= end_date, the shift is over. + # (e.g. Started last Wed, ends today Wed. If today is Wed, we rotate) + if today >= end_date: + removed = schedule.pop(0) + print(f"Rotated out: {removed} (Ended {end_date})") + else: + print(f"First entry {first_entry} has not ended yet (Ends {end_date}). Not removing.") + except ValueError: + # Fallback if date is invalid, rotate anyway + removed = schedule.pop(0) + print(f"Rotated out (invalid date): {removed}") + else: + print("Schedule empty, nothing to rotate.") + + # 2. Replenish + ensure_schedule_filled(schedule, repo_owner) + + # 3. Update active oncall team + if schedule: + current_oncall = schedule[0]['user'] + print(f"New active oncall: {current_oncall}") + if not dry_run: + update_active_oncall_team(repo_owner, current_oncall) + else: + print(f"Dry run: Would update {ACTIVE_ONCALL_TEAM_SLUG} to contain only {current_oncall}") + + if not dry_run: + save_schedule(schedule) + print("Schedule updated and saved.") + else: + print("Dry run: Schedule not saved.") + print(json.dumps(schedule, indent=4)) + +def get_last_wednesday(): + today = datetime.now(timezone.utc).date() + # Monday=0, Wednesday=2 + offset = (today.weekday() - 2) % 7 + return today - timedelta(days=offset) + +def ensure_schedule_filled(schedule, repo_owner): + """Appends users to schedule until it reaches TARGET_WEEKS.""" + members = get_team_members(repo_owner, ROTATION_TEAM_SLUG) + if not members: + print(f"Warning: No team members found in {ROTATION_TEAM_SLUG}.") + return + if 'svcnvidia-nemo-ci' in members: + members.remove('svcnvidia-nemo-ci') + members = list(members) + + members.sort() # Deterministic order + + while len(schedule) < TARGET_WEEKS: + # Determine start date for the new entry + if not schedule: + # Start with the most recent Wednesday if list is empty + next_date = get_last_wednesday() + + # Start with the first member alphabetically if list is empty + next_user = members[0] + else: + last_entry = schedule[-1] + last_user = last_entry['user'] + + # Parse last date and add 7 days + try: + last_date = datetime.strptime(last_entry['date'], "%Y-%m-%d").date() + next_date = last_date + timedelta(days=7) + except ValueError: + # Fallback if date is invalid/placeholder + next_date = get_last_wednesday() + timedelta(days=7 * len(schedule)) + + try: + # Find index of last scheduled user in the team list + if last_user in members: + last_idx = members.index(last_user) + next_idx = (last_idx + 1) % len(members) + next_user = members[next_idx] + else: + # Last user not in team, just pick first member + next_user = members[0] + except ValueError: + next_user = members[0] + + new_entry = {"user": next_user, "date": next_date.strftime("%Y-%m-%d")} + schedule.append(new_entry) + print(f"Appended: {new_entry}") + +def assign_reviewer(pr_number): + """Assigns the mcore-oncall team as the reviewer for the PR.""" + owner, repo = get_repo_info() + url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_number}/requested_reviewers" + + # Assign the oncall team as reviewer + data = {"team_reviewers": [ACTIVE_ONCALL_TEAM_SLUG]} + resp = requests.post(url, 
headers=get_headers(), json=data) + + if resp.status_code in [201, 200]: + print(f"Successfully requested review from team NVIDIA/{ACTIVE_ONCALL_TEAM_SLUG}") + else: + print(f"Failed to request review: {resp.status_code} {resp.text}") + sys.exit(1) + +def main(): + parser = argparse.ArgumentParser(description="Manage Oncall Schedule") + subparsers = parser.add_subparsers(dest="command", required=True) + + # Rotate command + parser_rotate = subparsers.add_parser("rotate", help="Rotate the schedule (remove first, append new)") + parser_rotate.add_argument("--dry-run", action="store_true", help="Do not save changes") + + # Fill command (just fill up to 12 without rotating - useful for init) + parser_fill = subparsers.add_parser("fill", help="Fill the schedule to 12 weeks without rotating") + + # Assign command + parser_assign = subparsers.add_parser("assign", help="Assign current oncall to PR") + parser_assign.add_argument("--pr", type=int, required=True, help="PR number") + + args = parser.parse_args() + + owner, _ = get_repo_info() + + if args.command == "rotate": + rotate_schedule(owner, dry_run=args.dry_run) + elif args.command == "fill": + schedule = load_schedule() + ensure_schedule_filled(schedule, owner) + save_schedule(schedule) + print("Schedule filled and saved.") + elif args.command == "assign": + assign_reviewer(args.pr) + +if __name__ == "__main__": + main() + diff --git a/.github/scripts/readme.sh b/.github/scripts/readme.sh new file mode 100644 index 00000000000..216d5224a28 --- /dev/null +++ b/.github/scripts/readme.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +cat << 'EOF' +╔══════════════════════════════════════════════════════════════════════╗ +║ ║ +║ ███╗ ███╗██████╗ ██████╗ ██╗██████╗ ██████╗ ███████╗ ║ +║ ████╗ ████║██╔══██╗██╔══██╗██║██╔══██╗██╔════╝ ██╔════╝ ║ +║ ██╔████╔██║██████╔╝██████╔╝██║██║ ██║██║ ███╗█████╗ ║ +║ ██║╚██╔╝██║██╔══██╗██╔══██╗██║██║ ██║██║ ██║██╔══╝ ║ +║ ██║ ╚═╝ ██║██████╔╝██║ ██║██║██████╔╝╚██████╔╝███████╗ ║ +║ ╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═╝╚═════╝ ╚═════╝ ╚══════╝ ║ +║ ║ +║ H O W T O : M B R I D G E T E S T I N G ║ +╚══════════════════════════════════════════════════════════════════════╝ + + MBridge unit tests run automatically on every PR. To also trigger + functional tests, attach the label and re-run the workflow step. + + ┌─────────────────────────────────────────────────────────────────┐ + │ DEFAULT │ Unit tests run on every PR (no action needed) │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ Every PR ──► cicd-mbridge-testing ──► unit tests only │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ STEP 1 │ Attach the label to your PR (for functional tests) │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ PR Labels ──► [ + Add label ] ──► "Run MBridge tests" │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ STEP 2 │ Re-run this workflow step │ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ Actions ──► [ Re-run jobs ] ──► Re-run failed jobs │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌─────────────────────────────────────────────────────────────────┐ + │ RESULT │ Unit + functional tests run! 
│ + ├─────────────────────────────────────────────────────────────────┤ + │ │ + │ cicd-mbridge-testing ◄── unit + functional tests │ + │ │ + │ Tests run against MBridge using the merge commit │ + │ SHA of your pull request. │ + │ │ + └─────────────────────────────────────────────────────────────────┘ + + ┌────────────────────────────────────┐ + │ Label present? NO → unit │ + │ Label present? YES → unit + │ + │ functional│ + └────────────────────────────────────┘ + + NOTE: The label must be present BEFORE the re-run is triggered. + The CI checks for "Run MBridge tests" at runtime. + + NOTE: All MBridge test results are optional — failures do not + block merging your PR. +EOF diff --git a/.github/scripts/sync_team_usergroups.py b/.github/scripts/sync_team_usergroups.py new file mode 100644 index 00000000000..c5f40f5fe33 --- /dev/null +++ b/.github/scripts/sync_team_usergroups.py @@ -0,0 +1,596 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Syncs GitHub team membership to Slack user groups. + +This script reads members from GitHub teams and updates the corresponding +Slack user groups to match. +""" + +import os +import re +import sys +import argparse +import requests + +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +# Constants +GITHUB_API_URL = "https://api.github.com" + +# Teams whose *children* are each synced to their own Slack usergroup +PARENT_TEAM_SLUGS = ["mcore-reviewers"] + +# Teams synced directly (the team itself, not its children) +DIRECT_TEAM_SLUGS = ["mcore-engineers"] + +# Caches for email and Slack lookups +_email_cache = {} +_slack_id_cache = {} +_usergroups_cache = None + + +def get_headers(): + """Get GitHub API headers with authentication.""" + token = os.environ.get("GH_TOKEN") + if not token: + token = os.environ.get("GITHUB_TOKEN") + + if not token: + print("Error: GH_TOKEN or GITHUB_TOKEN not set") + sys.exit(1) + + return { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + + +def get_org(): + """Returns the organization from GITHUB_REPOSITORY env var or default.""" + repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM") + return repo_env.split("/")[0] + + +def github_team_to_slack_usergroup(team_slug): + """Convert a GitHub team slug to a Slack usergroup handle. 
+ + Rules: + - Base pattern: "test" -> "mcore-test" + - Remove "core-" prefix: "core-test" -> "mcore-test" + - Remove "megatron-" prefix: "megatron-test" -> "mcore-test" + - Remove "-and-": "test1-and-test2" -> "mcore-test1-test2" + - Shorten "mixture-of-experts" to "moe" + - Shorten "pipeline-parallelism" to "pp" + - Shorten "reinforcement-learning" to "rl" + """ + name = team_slug + + # Apply shortenings first (before removing prefixes) + name = name.replace("mixture-of-experts", "moe") + name = name.replace("pipeline-parallelism", "pp") + name = name.replace("reinforcement-learning", "rl") + + # Remove prefixes + if name.startswith("core-"): + name = name[5:] # Remove "core-" + elif name.startswith("megatron-"): + name = name[9:] # Remove "megatron-" + elif name.startswith("mcore-"): + name = name[6:] # Remove "mcore-" + + # Remove "-and-" + name = name.replace("-and-", "-") + + return f"mcore-{name}" + + +def get_child_teams(org, parent_team_slug): + """Fetches child teams of a parent GitHub team.""" + # First get the team ID + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{parent_team_slug}" + headers = get_headers() + + resp = requests.get(url, headers=headers) + if resp.status_code != 200: + print(f"Error fetching parent team '{parent_team_slug}': {resp.status_code} {resp.text}") + return [] + + parent_team_id = resp.json().get("id") + if not parent_team_id: + print(f"Error: Could not get ID for team '{parent_team_slug}'") + return [] + + # Now fetch child teams + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{parent_team_slug}/teams" + child_teams = [] + page = 1 + + while True: + resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers) + if resp.status_code != 200: + print(f"Error fetching child teams: {resp.status_code} {resp.text}") + return child_teams + + data = resp.json() + if not data: + break + + child_teams.extend([team["slug"] for team in data]) + if len(data) < 100: + break + page += 1 + + return child_teams + + +def get_team_members(org, team_slug): + """Fetches members of the GitHub team.""" + url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members" + headers = get_headers() + + members = set() + page = 1 + while True: + resp = requests.get(f"{url}?per_page=100&page={page}", headers=headers) + if resp.status_code == 404: + print(f"Warning: Team '{team_slug}' not found in org '{org}'") + return set() + if resp.status_code != 200: + print(f"Error fetching team members: {resp.status_code} {resp.text}") + return set() + + data = resp.json() + if not data: + break + + members.update([m["login"] for m in data]) + if len(data) < 100: + break + page += 1 + + return members + + +def get_user_email(username): + """Get user's email from GitHub, prioritizing @nvidia.com emails. + + Checks in order: + 1. Public profile email + 2. Recent commits in the repository + """ + if username in _email_cache: + return _email_cache[username] + + headers = get_headers() + public_email = None + + try: + # 1. Try to get user's public profile email first + resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers) + if resp.status_code == 200: + user_data = resp.json() + email = user_data.get('email') + if email and not email.endswith("@users.noreply.github.com"): + if email.endswith("@nvidia.com"): + _email_cache[username] = email + return email + # Store non-nvidia email as fallback + public_email = email + + # 2. 
Check recent commits in the repository for @nvidia.com email + repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM") + commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10" + resp = requests.get(commits_url, headers=headers) + + if resp.status_code == 200: + commits = resp.json() + for commit in commits: + commit_data = commit.get('commit', {}) + + # Get email from commit author metadata + author_data = commit_data.get('author', {}) + email = author_data.get('email') + + if email and not email.endswith("@users.noreply.github.com"): + if email.endswith("@nvidia.com"): + _email_cache[username] = email + print(f"Found @nvidia.com email for {username} from commits") + return email + elif public_email is None: + public_email = email + + # Check Signed-off-by lines in the commit message for @nvidia.com emails + message = commit_data.get('message', '') + sob_matches = re.findall( + r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message + ) + if sob_matches: + _email_cache[username] = sob_matches[0] + print(f"Found @nvidia.com email for {username} from Signed-off-by") + return sob_matches[0] + + # 3. Use public email if found, otherwise fallback + if public_email: + _email_cache[username] = public_email + print(f"Using public email for {username}: {public_email}") + return public_email + + # Fallback to noreply email + fallback = f"{username}@users.noreply.github.com" + _email_cache[username] = fallback + print(f"Warning: No email found for {username}, using fallback: {fallback}") + return fallback + + except Exception as e: + print(f"Warning: Could not get email for {username}: {e}") + fallback = f"{username}@users.noreply.github.com" + _email_cache[username] = fallback + return fallback + + +def get_slack_client(): + """Get Slack WebClient if token is available.""" + slack_token = os.environ.get("SLACK_TOKEN") + if not slack_token: + return None + + return WebClient(token=slack_token) + + +def get_slack_user_id(slack_client, email): + """Get Slack user ID from email.""" + if not slack_client: + return None + + if email in _slack_id_cache: + return _slack_id_cache[email] + + try: + response = slack_client.users_lookupByEmail(email=email) + user_id = response["user"]["id"] + _slack_id_cache[email] = user_id + return user_id + except SlackApiError as e: + print(f"Warning: Could not find Slack user for {email}: {e.response['error']}") + _slack_id_cache[email] = None + return None + + +def fetch_all_usergroups(slack_client): + """Fetch all Slack usergroups once and cache them.""" + global _usergroups_cache + + if _usergroups_cache is not None: + return _usergroups_cache + + if not slack_client: + _usergroups_cache = {} + return _usergroups_cache + + try: + print("Fetching Slack usergroups...") + response = slack_client.usergroups_list(include_users=True) + _usergroups_cache = {} + for usergroup in response.get("usergroups", []): + handle = usergroup.get("handle") + if handle: + _usergroups_cache[handle] = { + "id": usergroup.get("id"), + "users": usergroup.get("users", []), + } + print(f"Fetched {len(_usergroups_cache)} usergroups") + return _usergroups_cache + except SlackApiError as e: + print(f"Warning: Could not list Slack usergroups: {e.response['error']}") + _usergroups_cache = {} + return _usergroups_cache + + +def get_slack_usergroup_id(slack_client, handle): + """Get Slack usergroup ID from handle.""" + usergroups = fetch_all_usergroups(slack_client) + + if handle in usergroups: + return usergroups[handle]["id"], usergroups[handle]["users"] + + 
return None, [] + + +def github_team_to_usergroup_name(team_slug): + """Convert a GitHub team slug to a Slack usergroup display name. + + Example: "test3" -> "Megatron Core Experts: Test3" + """ + # Title case each word separated by hyphens, then join with spaces + words = team_slug.split("-") + title_cased = " ".join(word.capitalize() for word in words) + return f"Megatron Core Experts: {title_cased}" + + +def create_slack_usergroup(slack_client, handle, team_slug): + """Create a new Slack usergroup. + + Args: + slack_client: Slack WebClient instance + handle: The usergroup handle (e.g., "mcore-test") + team_slug: The GitHub team slug (used for name and description) + + Returns: + The usergroup ID if created successfully, None otherwise + """ + global _usergroups_cache + + name = github_team_to_usergroup_name(team_slug) + description = f'Expert review group "{team_slug}"' + + try: + print(f"Creating Slack usergroup '@{handle}' with name '{name}'...") + response = slack_client.usergroups_create( + name=name, + handle=handle, + description=description, + ) + usergroup = response.get("usergroup", {}) + usergroup_id = usergroup.get("id") + + if usergroup_id: + # Update cache with new usergroup + if _usergroups_cache is not None: + _usergroups_cache[handle] = { + "id": usergroup_id, + "users": [], + } + print(f"Successfully created Slack usergroup '@{handle}'") + return usergroup_id + else: + print(f"Error: Usergroup created but no ID returned") + return None + + except SlackApiError as e: + print(f"Error creating Slack usergroup '@{handle}': {e.response['error']}") + return None + + +def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False): + """Sync a GitHub team to a Slack usergroup.""" + print(f"\n{'='*60}") + print(f"Syncing GitHub team '{team_slug}' -> Slack usergroup '@{usergroup_handle}'") + print(f"{'='*60}") + + org = get_org() + slack_client = get_slack_client() + + if not slack_client: + print("Error: Slack token not configured") + return False + + # 1. Get GitHub team members + members = get_team_members(org, team_slug) + if not members: + print(f"No members found in GitHub team '{team_slug}'") + return False + + # Filter out service accounts + members = {m for m in members if not m.startswith("svc")} + print(f"GitHub team members ({len(members)}): {sorted(members)}") + + # 2. Get Slack user IDs for each member + slack_user_ids = [] + missing_users = [] + + for username in sorted(members): + email = get_user_email(username) + slack_id = get_slack_user_id(slack_client, email) + if slack_id: + slack_user_ids.append(slack_id) + else: + missing_users.append((username, email, "not found in Slack")) + + if missing_users: + print(f"\nWarning: Could not resolve {len(missing_users)} users:") + for username, email, reason in missing_users: + print(f" - {username}: {reason}" + (f" (tried {email})" if email else "")) + + if not slack_user_ids: + print(f"Error: No Slack users found for team '{team_slug}'") + return False + + # 3. 
Get current Slack usergroup membership (or create if it doesn't exist) + usergroup_id, current_members = get_slack_usergroup_id(slack_client, usergroup_handle) + + if not usergroup_id: + print(f"Slack usergroup '@{usergroup_handle}' not found, creating it...") + if dry_run: + print(f"Dry run: Would create usergroup '@{usergroup_handle}'") + current_members = [] + else: + usergroup_id = create_slack_usergroup(slack_client, usergroup_handle, team_slug) + if not usergroup_id: + print(f"Error: Failed to create Slack usergroup '@{usergroup_handle}'") + return False + current_members = [] + + # 4. Compare and update + current_set = set(current_members) + new_set = set(slack_user_ids) + + to_add = new_set - current_set + to_remove = current_set - new_set + + print(f"\nCurrent usergroup members: {len(current_members)}") + print(f"New members to set: {len(slack_user_ids)}") + print(f" Adding: {len(to_add)} users") + print(f" Removing: {len(to_remove)} users") + + if current_set == new_set: + print("No changes needed - usergroup is already in sync") + return True + + if dry_run: + print(f"\nDry run: Would update '@{usergroup_handle}' with {len(slack_user_ids)} members") + return True + + # 5. Update the usergroup + try: + slack_client.usergroups_users_update( + usergroup=usergroup_id, users=slack_user_ids + ) + print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members") + return True + except SlackApiError as e: + print(f"Error updating usergroup: {e.response['error']}") + return False + + +def get_team_to_usergroup_mapping(parent_team_slug): + """Fetch child teams of a parent team and generate the mapping.""" + org = get_org() + child_teams = get_child_teams(org, parent_team_slug) + + if not child_teams: + print(f"Error: No child teams found under '{parent_team_slug}'") + return {} + + mapping = {} + for team_slug in child_teams: + usergroup_handle = github_team_to_slack_usergroup(team_slug) + mapping[team_slug] = usergroup_handle + + return mapping + + +def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None): + """Sync GitHub teams to their Slack usergroups. + + Args: + parent_teams: List of team slugs whose *children* are each synced. + Defaults to PARENT_TEAM_SLUGS. + direct_teams: List of team slugs synced directly (not their children). + Defaults to DIRECT_TEAM_SLUGS. 
+ """ + if parent_teams is None: + parent_teams = PARENT_TEAM_SLUGS + if direct_teams is None: + direct_teams = DIRECT_TEAM_SLUGS + + team_to_usergroup = {} + + for parent_slug in parent_teams: + print(f"Fetching child teams of '{parent_slug}'...") + mapping = get_team_to_usergroup_mapping(parent_slug) + team_to_usergroup.update(mapping) + + for team_slug in direct_teams: + usergroup_handle = github_team_to_slack_usergroup(team_slug) + team_to_usergroup[team_slug] = usergroup_handle + + if not team_to_usergroup: + return False + + print(f"Found {len(team_to_usergroup)} teams to sync") + print("\nTeam to usergroup mapping:") + for team, usergroup in sorted(team_to_usergroup.items()): + print(f" {team} -> @{usergroup}") + + results = {"success": [], "failed": []} + + for team_slug, usergroup_handle in team_to_usergroup.items(): + success = sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=dry_run) + if success: + results["success"].append(team_slug) + else: + results["failed"].append(team_slug) + + # Summary + print(f"\n{'='*60}") + print("SYNC SUMMARY") + print(f"{'='*60}") + print(f"Successful: {len(results['success'])}") + print(f"Failed: {len(results['failed'])}") + + if results["failed"]: + print(f"\nFailed teams: {', '.join(results['failed'])}") + return False + + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Sync GitHub team membership to Slack user groups" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be done without making changes", + ) + parser.add_argument( + "--list", + action="store_true", + help="List all configured team-to-usergroup mappings", + ) + parser.add_argument( + "--parent-team", + action="append", + dest="parent_teams", + metavar="SLUG", + help=( + "Sync all children of this GitHub team (can be repeated). " + f"Defaults to: {PARENT_TEAM_SLUGS}" + ), + ) + parser.add_argument( + "--team", + action="append", + dest="direct_teams", + metavar="SLUG", + help=( + "Sync this GitHub team directly (can be repeated). 
" + f"Defaults to: {DIRECT_TEAM_SLUGS}" + ), + ) + + args = parser.parse_args() + + # Use CLI values when provided, otherwise fall back to module-level defaults + parent_teams = args.parent_teams if args.parent_teams is not None else PARENT_TEAM_SLUGS + direct_teams = args.direct_teams if args.direct_teams is not None else DIRECT_TEAM_SLUGS + + if args.list: + team_to_usergroup = {} + for parent_slug in parent_teams: + print(f"Fetching child teams of '{parent_slug}'...") + team_to_usergroup.update(get_team_to_usergroup_mapping(parent_slug)) + for team_slug in direct_teams: + team_to_usergroup[team_slug] = github_team_to_slack_usergroup(team_slug) + if not team_to_usergroup: + sys.exit(1) + print("\nTeam-to-usergroup mappings:") + print(f"{'GitHub Team':<35} {'Slack Usergroup':<30}") + print("-" * 65) + for team, usergroup in sorted(team_to_usergroup.items()): + print(f"{team:<35} @{usergroup:<29}") + return + + success = sync_all_teams( + dry_run=args.dry_run, parent_teams=parent_teams, direct_teams=direct_teams + ) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml new file mode 100644 index 00000000000..0df8756d082 --- /dev/null +++ b/.github/workflows/_build_test_publish_wheel.yml @@ -0,0 +1,196 @@ +on: + workflow_call: + inputs: + ref: + required: false + description: Ref (SHA or branch) to release + type: string + default: ${{ github.sha }} + dry-run: + required: false + description: Upload to PyPy Test instance + type: boolean + default: true + no-publish: + required: false + description: Do not publish the wheel + type: boolean + default: true + secrets: + TWINE_PASSWORD: + required: false + +jobs: + build-and-test-wheels: + strategy: + fail-fast: false + matrix: + include: + - PACKAGE: megatron-core + PLATFORM: arm64 + IMAGE: quay.io/pypa/manylinux_2_28_aarch64 + - PACKAGE: megatron-core + PLATFORM: amd64 + IMAGE: quay.io/pypa/manylinux_2_28_x86_64 + - PACKAGE: megatron-fsdp + IMAGE: quay.io/pypa/manylinux_2_28_x86_64 + PLATFORM: amd64 + runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} + env: + PACKAGE: ${{ matrix.PACKAGE }} + IMAGE: ${{ matrix.IMAGE }} + PLATFORM: ${{ matrix.PLATFORM }} + PUBLISH_DRYRUN: ${{ inputs.dry-run }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref }} + + - name: Build wheel + id: build-wheel + env: + NO_VCS_VERSION: "1" + run: | + set -x + + if [ "$PACKAGE" = "megatron-core" ]; then + ROOTDIR="megatron/core" + BUILD_DIR="." 
+ elif [ "$PACKAGE" = "megatron-fsdp" ]; then + ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp" + BUILD_DIR="megatron/core/distributed/fsdp/src" + else + echo Unknown package: $PACKAGE + exit 1 + fi + + if [ "$PUBLISH_DRYRUN" = "true" ]; then + PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" $ROOTDIR/package_info.py) + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py + fi + + pushd $BUILD_DIR + rm LICENSE || true + docker run --rm -e NO_VCS_VERSION=1 -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\ + for python_version in cp311 cp312 cp313; do \ + /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools>=80" build; \ + done && \ + for python_version in cp311 cp312 cp313; do \ + /opt/python/${python_version}-${python_version}/bin/python -m build; \ + done \ + ' + + PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl") + if [ -n "$PLATFORM_WHEELS" ]; then + echo "Found platform wheels to repair: $PLATFORM_WHEELS" + docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS + docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl + docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/ + fi + popd + + pushd $ROOTDIR + EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)") + popd + + echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}" + + if [ "$PACKAGE" = "megatron-fsdp" ]; then + mkdir -p dist/ + cp -a megatron/core/distributed/fsdp/src/dist/* dist/ + fi + + ls -al dist/ + + - name: Test wheels + run: | + ls -al dist/ + + if [ "$PACKAGE" = "megatron-core" ]; then + ROOTPATH="megatron.core" + WHEEL_PREFIX="megatron_core" + elif [ "$PACKAGE" = "megatron-fsdp" ]; then + ROOTPATH="megatron_fsdp" + WHEEL_PREFIX="megatron_fsdp" + else + echo Unknown package: $PACKAGE + exit 1 + fi + + if [ "$PACKAGE" = "megatron-core" ]; then + if [[ "$PLATFORM" == "arm64" ]]; then + WHEEL_GLOB="dist/${WHEEL_PREFIX}*cp312*aarch64.whl" + else + WHEEL_GLOB="dist/${WHEEL_PREFIX}*cp312*x86_64.whl" + fi + else + WHEEL_GLOB="dist/${WHEEL_PREFIX}*.whl" + fi + + docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c "\ + /opt/python/cp312-cp312/bin/pip install --no-cache-dir $WHEEL_GLOB && \ + rm -rf megatron/ && \ + RELEASE_NUMBER=\$(/opt/python/cp312-cp312/bin/python -c 'import $ROOTPATH; print($ROOTPATH.__version__)') && \ + test '${{ steps.build-wheel.outputs.expected-release-number }}' == \"\$RELEASE_NUMBER\" \ + " + + - name: Upload wheels + uses: actions/upload-artifact@v6 + with: + name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }} + path: dist/ + + publish-wheels: + needs: [build-and-test-wheels] + runs-on: ubuntu-latest + environment: + name: ${{ inputs.no-publish && 'public' || 'main' }} + strategy: + fail-fast: false + matrix: + include: + - PACKAGE: megatron-core + PLATFORM: arm64 + - PACKAGE: megatron-core + PLATFORM: amd64 + - PACKAGE: megatron-fsdp + PLATFORM: amd64 + env: + PACKAGE: ${{ matrix.PACKAGE }} + steps: + - name: Download wheels + uses: actions/download-artifact@v7 + with: + name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }} + path: dist/ + merge-multiple: true + + - name: Publish wheels + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + TWINE_REPOSITORY: ${{ 
(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }} + PLATFORM: ${{ matrix.PLATFORM }} + DRY_RUN: ${{ inputs.no-publish }} + run: | + + # Delete sdist for arm64 since we already upload it with amd64. + if [ "$PLATFORM" == "arm64" ]; then + rm dist/*.tar.gz + fi + + ls -al dist/ + pip install twine + + if [[ "$DRY_RUN" == "false" ]]; then + [[ -z "$TWINE_PASSWORD" ]] && { echo "::error::TWINE_PASSWORD unset"; exit 1; } + twine upload \ + --verbose \ + -r $TWINE_REPOSITORY \ + -u $TWINE_USERNAME \ + -p $TWINE_PASSWORD \ + dist/* + else + echo "[dry-run] would execute: twine upload --verbose -r $TWINE_REPOSITORY -u -p dist/*" + fi diff --git a/.github/workflows/_update_dependencies.yml b/.github/workflows/_update_dependencies.yml new file mode 100644 index 00000000000..b8410f8fc00 --- /dev/null +++ b/.github/workflows/_update_dependencies.yml @@ -0,0 +1,228 @@ +name: ~Update dependencies template +on: + workflow_call: + inputs: + target-branch: + required: true + type: string + description: "The target branch to bump" + secrets: + PAT: + required: true + SSH_KEY: + required: true + SSH_PWD: + required: true + +jobs: + pre-flight: + runs-on: ubuntu-latest + outputs: + bump-branch: bump-ci-container-${{ steps.ref.outputs.date }}-${{ inputs.target-branch }} + date: ${{ steps.ref.outputs.date }} + steps: + - name: Get date + id: ref + run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT" + + update-lockfile: + runs-on: linux-amd64-cpu16 + needs: [pre-flight] + env: + SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }} + TARGET_BRANCH: ${{ inputs.target-branch }} + steps: + - name: Checkout repo + uses: actions/checkout@v6 + with: + ref: ${{ env.TARGET_BRANCH }} + + - name: Mock test data + run: mkdir -p assets/ + + - name: Fetch NGC Version + id: ngc-version + run: | + NGC_VERSION=$(cat docker/.ngc_version.dev) + echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT" + + - name: Build container + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core . + + - name: Create bump branch if not exists + run: | + if ! 
git ls-remote --exit-code origin $SOURCE_BRANCH; then
+ git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
+ git push origin $SOURCE_BRANCH
+ fi
+
+ - name: Checkout repo
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ env.SOURCE_BRANCH }}
+
+ - name: Upgrade lock file
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ run: |
+ docker run \
+ --rm \
+ -v $(pwd):/workspace \
+ -w /workspace \
+ -e GH_TOKEN=${{ secrets.PAT }} \
+ megatron-core \
+ bash -c 'uv lock --upgrade'
+
+ - name: Upload lock file
+ uses: actions/upload-artifact@v6
+ with:
+ name: lock-file-${{ env.SOURCE_BRANCH }}
+ path: uv.lock
+
+ create-pr:
+ needs: [update-lockfile, pre-flight]
+ runs-on: ubuntu-latest
+ env:
+ SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
+ TARGET_BRANCH: ${{ inputs.target-branch }}
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v6
+ with:
+ token: ${{ secrets.PAT }}
+ ref: ${{ env.TARGET_BRANCH }}
+
+ - name: Rebase against ${{ env.SOURCE_BRANCH }}
+ run: |
+ if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then
+ git fetch origin ${{ env.SOURCE_BRANCH }}
+ git rebase -S origin/${{ env.SOURCE_BRANCH }}
+ fi
+
+ - name: Download lock file
+ uses: actions/download-artifact@v7
+ with:
+ name: lock-file-${{ env.SOURCE_BRANCH }}
+
+ - name: Create Bump PR
+ uses: peter-evans/create-pull-request@v8
+ id: create-pull-request
+ env:
+ title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
+ with:
+ branch: ${{ env.SOURCE_BRANCH }}
+ base: ${{ env.TARGET_BRANCH }}
+ title: ${{ env.title }}
+ token: ${{ secrets.PAT }}
+ labels: Run functional tests
+ body: |
+ 🚀 PR to bump `uv.lock` in `${{ inputs.target-branch }}`.
+
+ 📝 Please remember the following to-do's before merge:
+ - [ ] Verify the presubmit CI
+
+ 🙏 Please merge this PR only if the CI workflow completed successfully.
+ commit-message: ${{ env.title }}
+ signoff: true
+ committer: "github-actions[bot] <github-actions[bot]@users.noreply.github.com>"
+
+ - name: Post /ok to test comment
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ run: |
+ PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
+ if [ -z "$PR_NUMBER" ]; then
+ echo "No PR was created, skipping comment"
+ exit 0
+ fi
+ SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}"
+ gh pr comment "$PR_NUMBER" --body "/ok to test $SHA"
+
+ - name: Wait for CI checks
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ run: |
+ PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
+ if [ -z "$PR_NUMBER" ]; then
+ echo "No PR was created, skipping wait"
+ exit 0
+ fi
+
+ # Fetch required status checks from branch protection rules
+ REQUIRED_CHECKS=$(gh api \
+ "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
+ --jq '.checks[].context' 2>/dev/null \
+ || gh api \
+ "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
+ --jq '.contexts[]' 2>/dev/null \
+ || true)
+
+ if [ -z "$REQUIRED_CHECKS" ]; then
+ echo "No branch protection rules found for ${{ env.TARGET_BRANCH }}, skipping wait"
+ exit 0
+ fi
+
+ echo "Required checks from branch protection:"
+ echo "$REQUIRED_CHECKS"
+
+ echo "Waiting for required checks to complete on PR #$PR_NUMBER..."
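+ # Polling loop: success/pass/skip/neutral states count a check as done,
+ # pending-like states keep the loop alive, and failures only become fatal
+ # once a pending state has been observed (INITIALIZED=true), presumably so
+ # results left over from an earlier push are not misread as fresh failures.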
+ i=0 + INITIALIZED=false + while true; do + i=$((i + 1)) + CHECKS_JSON=$(gh pr checks "$PR_NUMBER" --json name,state 2>/dev/null || echo "[]") + ALL_DONE=true + FAILED_CHECKS="" + while IFS= read -r check; do + CHECK_STATE=$(echo "$CHECKS_JSON" | jq -r --arg name "$check" '.[] | select(.name == $name) | .state // ""' | tr '[:upper:]' '[:lower:]') + case "$CHECK_STATE" in + *success*|*pass*|*skip*|*neutral*) ;; + *pending*|*queued*|*progress*|*waiting*|*request*|"") + ALL_DONE=false + INITIALIZED=true + break + ;; + *) + if [ "$INITIALIZED" = "true" ]; then + FAILED_CHECKS="${FAILED_CHECKS} - ${check} (${CHECK_STATE})"$'\n' + else + ALL_DONE=false + fi + ;; + esac + done <<< "$REQUIRED_CHECKS" + if [ "$ALL_DONE" = "true" ]; then + if [ -n "$FAILED_CHECKS" ]; then + echo "Required check(s) did not pass:" + echo "$FAILED_CHECKS" + exit 1 + fi + echo "All required checks passed!" + break + fi + echo "Checks not yet complete (attempt $i), retrying in 30s..." + sleep 30 + done + + - name: Merge PR + env: + title: "chore(beep boop 🤖): Bump `uv.lock` (${{ env.TARGET_BRANCH}}) (${{ needs.pre-flight.outputs.date }})" + run: | + PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}" + if [ -z "$PR_NUMBER" ]; then + echo "No PR was created, skipping merge" + exit 0 + fi + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git fetch origin ${{ env.SOURCE_BRANCH }} + git fetch origin ${{ env.TARGET_BRANCH }} + git checkout ${{ env.TARGET_BRANCH }} + git merge --squash origin/${{ env.SOURCE_BRANCH }} + git commit -m "${{ env.title }}" + git pull --rebase origin ${{ env.TARGET_BRANCH }} + git push origin ${{ env.TARGET_BRANCH }} + git push origin --delete ${{ env.SOURCE_BRANCH }} diff --git a/.github/workflows/auto-assign-milestone.yml b/.github/workflows/auto-assign-milestone.yml new file mode 100644 index 00000000000..b972329bac1 --- /dev/null +++ b/.github/workflows/auto-assign-milestone.yml @@ -0,0 +1,74 @@ +name: Auto-assign Milestone to PR + +on: + push: + branches: + - "pull-request/[0-9]+" + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + assign-milestone: + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Check if PR has milestone + id: check_milestone + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + MILESTONE=$(gh pr view ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ + --repo ${{ github.repository }} \ + --json milestone \ + --jq '.milestone.title') + + if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then + echo "has_milestone=false" >> $GITHUB_OUTPUT + else + echo "has_milestone=true" >> $GITHUB_OUTPUT + echo "PR already has milestone: $MILESTONE" + fi + + - name: Get most recent open milestone + if: steps.check_milestone.outputs.has_milestone == 'false' + id: get_milestone + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + # Get the most recent open milestone (sorted by due date, then by creation date) + MILESTONE_NUMBER=$(gh api \ + "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ + --jq '.[0].number') + + MILESTONE_TITLE=$(gh api \ + "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ + --jq '.[0].title') + + if [ -z "$MILESTONE_NUMBER" ] || [ "$MILESTONE_NUMBER" = "null" ]; then + echo "No open milestones 
found" + echo "milestone_found=false" >> $GITHUB_OUTPUT + else + echo "milestone_found=true" >> $GITHUB_OUTPUT + echo "milestone_number=$MILESTONE_NUMBER" >> $GITHUB_OUTPUT + echo "milestone_title=$MILESTONE_TITLE" >> $GITHUB_OUTPUT + echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)" + fi + + - name: Assign milestone to PR + if: steps.check_milestone.outputs.has_milestone == 'false' && steps.get_milestone.outputs.milestone_found == 'true' + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + gh pr edit ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ + --repo ${{ github.repository }} \ + --milestone "${{ steps.get_milestone.outputs.milestone_title }}" + + echo "✅ Assigned milestone '${{ steps.get_milestone.outputs.milestone_title }}' to PR #${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}" diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml new file mode 100644 index 00000000000..72a48e9539e --- /dev/null +++ b/.github/workflows/auto-reminder-bot.yml @@ -0,0 +1,33 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +name: Auto Reminder Bot + +on: + workflow_dispatch: + schedule: + - cron: "0 12 * * *" + +jobs: + run-script: + name: Run Auto Reminder Bot + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Check out repository code + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip install --no-cache-dir PyGithub slack-sdk + + - name: Run Auto Reminder Bot + run: | + export SLACK_TOKEN=${{ secrets.SLACK_BOT_TOKEN }} + export SLACK_WEBHOOK_URL=${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }} + export GH_TOKEN=${{ secrets.PAT }} + python tests/test_utils/python_scripts/auto_reminder_github.py diff --git a/.github/workflows/auto-swap-labels.yml b/.github/workflows/auto-swap-labels.yml new file mode 100644 index 00000000000..d38fb65d210 --- /dev/null +++ b/.github/workflows/auto-swap-labels.yml @@ -0,0 +1,75 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +name: Auto Swap Labels +on: + pull_request_target: + types: [ready_for_review, synchronize] + branches: + - main + workflow_run: + workflows: ["Review Trigger"] + types: [completed] + +permissions: + pull-requests: write + contents: read + actions: read + +jobs: + check-approval: + runs-on: ubuntu-latest + if: >- + github.repository == 'NVIDIA/Megatron-LM' && ( + (github.event_name == 'pull_request_target' && + github.event.pull_request.base.ref == 'main' && + !github.event.pull_request.draft) || + (github.event_name == 'workflow_run' && + github.event.workflow_run.conclusion == 'success') + ) + + steps: + - name: Get PR number from workflow_run + id: get-pr + if: github.event_name == 'workflow_run' + continue-on-error: true + uses: actions/download-artifact@v7 + with: + name: pr-number + path: pr-number + github-token: ${{ github.token }} + run-id: ${{ github.event.workflow_run.id }} + + - name: Set PR number + id: pr + run: | + if [ "${{ github.event_name }}" = "workflow_run" ]; then + if [ "${{ steps.get-pr.outcome }}" != "success" ]; then + echo "No approval artifact found — review was not an approval. Skipping." 
+ exit 0 + fi + echo "number=$(cat pr-number/number)" >> $GITHUB_OUTPUT + else + echo "number=${{ github.event.pull_request.number }}" >> $GITHUB_OUTPUT + fi + + - name: Check out repository code + if: steps.pr.outputs.number + uses: actions/checkout@v6 + + - name: Set up Python + if: steps.pr.outputs.number + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Install dependencies + if: steps.pr.outputs.number + run: | + pip install --no-cache-dir PyGithub slack-sdk + + - name: Run Auto Swap Labels + if: steps.pr.outputs.number + run: | + export GH_TOKEN=${{ secrets.PAT }} + export PR_NUMBER=${{ steps.pr.outputs.number }} + python tests/test_utils/python_scripts/swap_pr_labels.py diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml new file mode 100644 index 00000000000..07fdcfbfbb8 --- /dev/null +++ b/.github/workflows/auto-update-copy-pr-bot.yml @@ -0,0 +1,65 @@ +name: Auto Update Copy PR Bot + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +jobs: + auto-update-copy-pr-bot: + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + token: ${{ secrets.PAT }} + ref: main + + - name: Fetch list of members in mcore-reviewers team + shell: bash -euxo pipefail {0} + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + #!/bin/bash + + get_members() { + local org=$1 team=$2 seen_file=$3 + + gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$seen_file" + + gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do + get_members "$org" "$child" "$seen_file" + done + + cat "$seen_file" + } + + tmp=$(mktemp) + echo "" > final.txt + get_members "NVIDIA" "mcore-engineers" "$tmp" | sort -u >> final.txt && rm "$tmp" + + tmp=$(mktemp) + get_members "NVIDIA" "mcore-reviewers" "$tmp" | sort -u >> final.txt && rm "$tmp" + + cat final.txt | jq -sR 'split("\n") | map(select(. != "")) | flatten | unique' + + export TRUSTEES=$(cat final.txt | jq -csR 'split("\n") | map(select(. != "")) | flatten | unique') + yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new + + mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml + + - name: Commit changes + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git + git config --global user.name "GitHub Actions" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git add .github/copy-pr-bot.yaml + if git diff --cached --exit-code --quiet; then + echo "No changes to commit. Exiting gracefully." + exit 0 + fi + git commit -m "Update copy-pr-bot.yaml [skip ci]" + git push -u origin main diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 00000000000..f77e665d22f --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,65 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build docs + +on: + push: + branches: + - main + - "pull-request/[0-9]+" + - "deploy-release/*" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + pre-flight: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v1.0.0 + + build-docs: + needs: [pre-flight] + if: needs.pre-flight.outputs.is_deployment_workflow != 'true' + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v1.0.0 + + build-docs-summary: + needs: [pre-flight, build-docs] + if: | + ( + needs.pre-flight.outputs.is_deployment_workflow == 'true' + || always() + ) + && !cancelled() + runs-on: ubuntu-latest + steps: + - name: Get workflow result + id: result + shell: bash -x -e -u -o pipefail {0} + env: + GH_TOKEN: ${{ github.token }} + RUN_ID: ${{ github.run_id }} + SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }} + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 + + if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + # Show which jobs failed + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' + exit 1 + fi diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml new file mode 100644 index 00000000000..9da305f07e6 --- /dev/null +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Create PR to main with cherry-pick from release + +on: + push: + branches: + - main + +jobs: + cherry-pick: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9 + if: github.repository == 'NVIDIA/Megatron-LM' + with: + target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+' + secrets: + PAT: ${{ secrets.PAT }} + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }} + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml new file mode 100644 index 00000000000..32b82a66e19 --- /dev/null +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -0,0 +1,288 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Approve Test Queue + +on: + schedule: + - cron: "*/5 * * * *" # Runs every 5 minutes + workflow_dispatch: # Allows manual triggering + +jobs: + approve-queue: + runs-on: ubuntu-latest + environment: main + if: github.repository == 'NVIDIA/Megatron-LM' + strategy: + matrix: + branch: [main, dev, others] + contributor_type: [internal, external] + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + + - name: Download SSO users list + run: | + gh release download v0.1.0 \ + --repo NVIDIA-GitHub-Management/github-audits \ + --pattern users_sso.json \ + --output users_sso.json || echo '{}' > users_sso.json + env: + GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} + + - name: Approve waiting deployments + env: + GITHUB_TOKEN: ${{ secrets.PAT }} + MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }} + MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 1 }} + CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }} + SSO_USERS_FILE: users_sso.json + PYTHONUNBUFFERED: 1 + shell: python + run: | + import os + import json + import requests + import re + import time + + # GitHub API configuration + GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] + REPO = os.environ["GITHUB_REPOSITORY"] + CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"] + if CONTRIBUTOR_TYPE == "external": + # Global limit across all branches — no division needed since we count globally. 
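+ # This script runs once per (branch x contributor_type) matrix cell:
+ # external cells share a single global budget (MAX_CONCURRENCY_EXTERNAL),
+ # while each internal cell takes a slice of MAX_CONCURRENCY (halved in
+ # the else branch below).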
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"]) + else: + MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2 + API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM" + + # Load SSO users for internal/external classification + with open(os.environ["SSO_USERS_FILE"]) as f: + sso_users = json.load(f) + + # Headers for GitHub API + headers = { + "Authorization": f"token {GITHUB_TOKEN}", + "Accept": "application/vnd.github.v3+json", + "X-GitHub-Api-Version": "2022-11-28", + } + + def make_request(endpoint, method="GET", data=None, max_retries=5): + """Make a request to the GitHub API with retry on transient errors.""" + url = f"{API_BASE}/{endpoint}" + for attempt in range(max_retries): + try: + if method == "GET": + response = requests.get(url, headers=headers, timeout=30) + else: + response = requests.post(url, headers=headers, json=data, timeout=30) + if response.status_code == 429: + retry_after = int(response.headers.get("Retry-After", 2 ** attempt)) + print(f"Rate limited on {endpoint}, retrying in {retry_after}s (attempt {attempt + 1}/{max_retries})") + time.sleep(retry_after) + continue + if response.status_code >= 500: + delay = 2 ** attempt + print(f"Server error {response.status_code} on {endpoint}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})") + time.sleep(delay) + continue + response.raise_for_status() + return response.json() + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: + delay = 2 ** attempt + print(f"Transient error on {endpoint}: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})") + time.sleep(delay) + except requests.exceptions.RequestException as e: + print(f"Error making request to {endpoint}: {str(e)}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + return None + print(f"Max retries ({max_retries}) exceeded for {endpoint}") + return None + + def is_internal_contributor(pr_info): + """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member).""" + login = pr_info.get("user", {}).get("login", "") + org_roles = sso_users.get(login, {}).get("org_roles", []) + return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles) + + def get_pr_base_branch(workflow_run): + """ + Return the base branch of the PR associated with a workflow run, or None. + Extracts PR number from head branch like 'pull-request/1913' and fetches PR info. + Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run. 
+ """ + print(workflow_run.get("head_branch", "")) + head_branch = workflow_run.get("head_branch", "") + match = re.match(r"pull-request/(\d+)", head_branch) + if not match: + return None, None # Not a PR branch pattern + + pr_number = int(match.group(1)) + + # Fetch PR info from GitHub API + pr_info = make_request(f"pulls/{pr_number}") + if not pr_info: + print(f"Failed to fetch PR #{pr_number}") + return None, None + + base_branch = pr_info.get("base", {}).get("ref") + return base_branch, pr_info + + def matches_contributor(workflow_run, contributor_type): + """Return True if the workflow run matches the contributor type (ignores branch).""" + _, pr_info = get_pr_base_branch(workflow_run) + if pr_info is None: + return False + internal = is_internal_contributor(pr_info) + return (contributor_type == "internal") == internal + + def matches_queue(workflow_run, target_branch, contributor_type): + """ + Return True if the workflow run belongs to this queue cell: + matching target branch AND matching contributor type (internal/external). + """ + base_branch, pr_info = get_pr_base_branch(workflow_run) + if base_branch is None: + return False + + branch_match = ( + (base_branch == target_branch) or + (base_branch != "main" and base_branch != "dev" and target_branch == "others") + ) + if not branch_match: + return False + + pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1) + internal = is_internal_contributor(pr_info) + contributor_match = (contributor_type == "internal") == internal + if branch_match and contributor_match: + print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})") + return branch_match and contributor_match + + # Get current running and queued workflows + print("Fetching workflow runs...") + queued_resp = make_request("actions/runs?status=queued") + if queued_resp is None: + print("Failed to fetch queued workflow runs after retries, exiting") + exit(1) + queued_workflow_runs = queued_resp.get("workflow_runs", []) + in_progress_resp = make_request("actions/runs?status=in_progress") + if in_progress_resp is None: + print("Failed to fetch in-progress workflow runs after retries, exiting") + exit(1) + in_progress_workflow_runs = in_progress_resp.get("workflow_runs", []) + + # For external contributors, enforce a single global concurrency limit across ALL branches. + # For internal contributors, enforce per-branch limits as before. 
+ if CONTRIBUTOR_TYPE == "external": + queued_workflow_runs = [run for run in queued_workflow_runs + if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)] + in_progress_workflow_runs = [run for run in in_progress_workflow_runs + if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)] + else: + # Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type + queued_workflow_runs = [run for run in queued_workflow_runs + if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] + in_progress_workflow_runs = [run for run in in_progress_workflow_runs + if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] + + # Count running and queued workflows + queued_workflows = len(queued_workflow_runs) + in_progress_workflows = len(in_progress_workflow_runs) + + total_workflows = queued_workflows + in_progress_workflows + print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}") + print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}") + print(f"Total workflows: {total_workflows}") + print(f"Max concurrency: {MAX_CONCURRENCY}") + + if total_workflows >= MAX_CONCURRENCY: + print("Maximum concurrency reached, no new approvals will be made") + exit(0) + + # Get waiting CI workflows for test environment + print("Fetching deployments...") + waiting_resp = make_request("actions/runs?status=waiting") + if waiting_resp is None: + print("Failed to fetch waiting workflow runs after retries, exiting") + exit(1) + pending_workflows = waiting_resp.get("workflow_runs", []) + print("Pending workflows:", len(pending_workflows)) + pending_workflows = [run for run in pending_workflows + if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] + + # Sort deployments by creation date (oldest first) + print("Sorting workflows...") + pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"]) + + # Process each deployment + print(f"Processing {len(pending_workflows)} pending workflows...") + for workflow in pending_workflows: + if total_workflows >= MAX_CONCURRENCY: + print("Maximum concurrency reached, stopping approvals") + break + + workflow_id = workflow["id"] + workflow_name = workflow["display_title"] + print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") + + deployment_url = f"actions/runs/{workflow_id}/pending_deployments" + deployments = make_request(deployment_url) + if not deployments: + print(f"Failed to fetch pending deployments for run {workflow_id}") + exit(1) + deployment = deployments[0] + environment_id = deployment["environment"]["id"] + + # Approve the deployment + status_data = { + "environment_ids": [environment_id], + "state": "approved", + "comment": "Automatically approved by queue manager" + } + result = make_request(deployment_url, method="POST", data=status_data) + + if result: + total_workflows += 1 + else: + print(f"Failed to approve deployment {deployment['id']}") + exit(1) + notify: + if: failure() + runs-on: ubuntu-latest + needs: [approve-queue] + steps: + - name: Notify + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + run: | + curl -X POST \ + -H 'Content-type: application/json' \ + 
--data "{\"text\":\":robot_joy: failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \ + $SLACK_WEBHOOK diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml new file mode 100644 index 00000000000..c814bf3106f --- /dev/null +++ b/.github/workflows/cicd-main.yml @@ -0,0 +1,1148 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: CICD Megatron-LM +on: + schedule: + - cron: 0 0 * * * + push: + branches: + - "pull-request/[0-9]+" + - "deploy-release/*" + merge_group: + types: [checks_requested] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +env: + container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com + container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-lm + +jobs: + is-not-external-contributor: + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/Megatron-LM' + outputs: + is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} + is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }} + selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }} + selected_runner_gb200: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-gcp-gpu-x4' || 'ubuntu-latest' }} + permissions: + issues: write + pull-requests: write + env: + GITHUB_TOKEN: ${{ secrets.PAT }} + REPO: ${{ github.repository }} + DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + token: ${{ env.GITHUB_TOKEN }} + + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Check NVIDIA SSO membership + id: check-sso + uses: ./.github/actions/check-nvidia-sso-membership + with: + username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} + github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} + sso_users_filename: ${{ vars.SSO_USERS_FILENAME }} + + - name: Set maintainer status + id: check-membership + env: + IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} + IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} + IS_WORKFLOW_DISPATCH: ${{ github.event_name == 'workflow_dispatch' }} + run: | + # Skip SSO check for scheduled jobs, main branch, merge groups, or manual dispatches + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ] || [ "${IS_WORKFLOW_DISPATCH}" == "true" ]; then + echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT + exit 0 + fi + + # Use SSO membership check result + IS_MEMBER="${{ 
steps.check-sso.outputs.is_member }}" + + # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo + if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then + PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} + + echo "Checking if $PR_AUTHOR is a repo collaborator..." + API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" + REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." + API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" + ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." + API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" + ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + $API_URL) + + if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then + IS_MEMBER="true" + else + exit 1 + fi + fi + + # Use SSO membership check result + if [ "$IS_MEMBER" == "true" ]; then + echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT + else + echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT + fi + + pre-flight: + needs: [is-not-external-contributor] + if: github.repository == 'NVIDIA/Megatron-LM' + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v1.0.0 + + configure: + runs-on: ubuntu-latest + needs: [pre-flight] + if: github.repository == 'NVIDIA/Megatron-LM' + outputs: + scope: ${{ steps.configure.outputs.scope }} + n_repeat: ${{ steps.configure.outputs.n_repeat }} + lightweight: ${{ steps.configure.outputs.lightweight }} + lts: ${{ steps.configure.outputs.lts }} + mbridge_suite: ${{ steps.configure.outputs.mbridge_suite }} + dev: ${{ steps.configure.outputs.dev }} + cadence: ${{ steps.configure.outputs.cadence }} + cadence_bypass: ${{ steps.configure.outputs.cadence_bypass }} + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Configure + id: configure + shell: bash -x -e -u -o pipefail {0} + env: + GH_TOKEN: ${{ secrets.PAT }} + IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }} + IS_MERGE_GROUP: ${{ needs.pre-flight.outputs.is_merge_group }} + EVENT_NAME: ${{ github.event_name }} + run: | + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + + # Fetch all labels in a single API call; fall back to empty list if no PR + LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]') || LABELS='[]' + + HAS_RUN_TESTS=$(echo "$LABELS" | jq 'any(. == "Run tests")') + HAS_RUN_FUNCTIONAL=$(echo "$LABELS" | jq 'any(. == "Run functional tests")') + HAS_LTS=$(echo "$LABELS" | jq 'any(. 
== "container::lts")') + HAS_MBRIDGE=$(echo "$LABELS" | jq 'any(. == "Run MBridge tests")') + + if [ "$IS_MERGE_GROUP" == "true" ]; then + SCOPE=L1; N_REPEAT=1; LIGHTWEIGHT=false + elif [ "$HAS_RUN_TESTS" == "true" ]; then + SCOPE=L1; N_REPEAT=1; LIGHTWEIGHT=true + elif [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then + SCOPE=L1; N_REPEAT=5; LIGHTWEIGHT=false + elif [ "$IS_CI_WORKLOAD" == "true" ] || [ "$EVENT_NAME" == "workflow_dispatch" ]; then + # Scheduled / dispatch / release have no PR labels; default to the + # full functional tier (L1) so cadence (set below) is the + # discriminator. `workflow_dispatch` is forced into this branch + # because upstream pre-flight reports is_ci_workload=false when + # dispatched from a `pull-request/*` branch, which would otherwise + # drop us into the slim tier. + SCOPE=L1; N_REPEAT=5; LIGHTWEIGHT=false + else + SCOPE=L0; N_REPEAT=5; LIGHTWEIGHT=false + fi + + if [ "$HAS_MBRIDGE" == "true" || $IS_MERGE_GROUP == "true" ]; then + MBRIDGE_SUITE="L1" + else + MBRIDGE_SUITE="unit-only" + fi + + # Cadence: trigger-driven test selection axis (see filter_by_cadence + # in tests/test_utils/python_scripts/recipe_parser.py). PR labels + # `Run tests` and `Run functional tests` bypass the cadence filter so + # contributors retain a manual override. + if [ "$IS_MERGE_GROUP" == "true" ]; then + CADENCE=mergegroup + elif [ "$EVENT_NAME" == "schedule" ] || [ "$EVENT_NAME" == "workflow_dispatch" ]; then + CADENCE=nightly + else + CADENCE=pr + fi + + if [ "$HAS_RUN_TESTS" == "true" ] || [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then + CADENCE_BYPASS=true + CADENCE_OUTPUT="" + else + CADENCE_BYPASS=false + CADENCE_OUTPUT="$CADENCE" + fi + + DEV=true + + echo "scope=$SCOPE" | tee -a $GITHUB_OUTPUT + echo "n_repeat=$N_REPEAT" | tee -a $GITHUB_OUTPUT + echo "lightweight=$LIGHTWEIGHT" | tee -a $GITHUB_OUTPUT + echo "lts=$HAS_LTS" | tee -a $GITHUB_OUTPUT + echo "mbridge_suite=$MBRIDGE_SUITE" | tee -a $GITHUB_OUTPUT + echo "dev=$DEV" | tee -a $GITHUB_OUTPUT + echo "cadence=$CADENCE_OUTPUT" | tee -a $GITHUB_OUTPUT + echo "cadence_bypass=$CADENCE_BYPASS" | tee -a $GITHUB_OUTPUT + + # Pre-compute active row markers for the decision tree + _MG=$( [ "$IS_MERGE_GROUP" == "true" ] && echo "**→**" || echo "" ) + _RT=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" == "true" ] && echo "**→**" || echo "" ) + _RF=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" == "true" ] && echo "**→**" || echo "" ) + _CI=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" != "true" ] && [ "$IS_CI_WORKLOAD" == "true" ] && echo "**→**" || echo "" ) + _DF=$( [ "$SCOPE" == "L0" ] && echo "**→**" || echo "" ) + _LTS=$( [ "$HAS_LTS" == "true" ] && echo "**→**" || echo "" ) + _DEV=$( [ "$HAS_LTS" != "true" ] && echo "**→**" || echo "" ) + _CMG=$( [ "$CADENCE" == "mergegroup" ] && echo "**→**" || echo "" ) + _CN=$( [ "$CADENCE" == "nightly" ] && echo "**→**" || echo "" ) + _CPR=$( [ "$CADENCE" == "pr" ] && echo "**→**" || echo "" ) + + cat <> $GITHUB_STEP_SUMMARY + Beep boop 🤖 I have consulted the labels and decided to run **$SCOPE** $( [ "$LIGHTWEIGHT" == "true" ] && echo "in lightweight mode " || echo "" )against the **$( [ "$HAS_LTS" == "true" ] && echo "lts" || echo "dev" )** container with **$N_REPEAT** repetition(s). You are welcome. 
+ + | Setting | Value | + |---|---| + | \`scope\` | \`$SCOPE\` | + | \`n_repeat\` | \`$N_REPEAT\` | + | \`lightweight\` | \`$LIGHTWEIGHT\` | + | \`lts\` | \`$HAS_LTS\` | + | \`dev\` | \`$DEV\` | + | \`mbridge_suite\` | \`$MBRIDGE_SUITE\` | + | \`cadence\` | \`$CADENCE\` | + | \`cadence_bypass\` | \`$CADENCE_BYPASS\` | + + ### Decision tree + + **Test scope** + + | | Trigger | \`scope\` | \`n_repeat\` | \`lightweight\` | + |---|---|---|---|---| + | $_MG | Merge group | \`L1\` | \`1\` | \`false\` | + | $_RT | Label: _Run tests_ | \`L1\` | \`1\` | \`true\` | + | $_RF | Label: _Run functional tests_ | \`L1\` | \`5\` | \`false\` | + | $_CI | Schedule / dispatch (CI workload) | \`L1\` | \`5\` | \`false\` | + | $_DF | _(default)_ | \`L0\` | \`5\` | \`false\` | + + **Cadence** _(filter bypassed when \`Run tests\` or \`Run functional tests\` label is set)_ + + | | Trigger | \`cadence\` | + |---|---|---| + | $_CMG | Merge group | \`mergegroup\` | + | $_CN | Schedule / dispatch | \`nightly\` | + | $_CPR | PR push (default) | \`pr\` | + + **Container image** + + | | Trigger | \`image\` | + |---|---|---| + | $_LTS | Label: _container::lts_ | \`lts\` | + | $_DEV | _(default)_ | \`dev\` | + + ### Glossary + - **\`lightweight\`**: trains for 4 steps instead of 100 and skips comparison against golden values — faster feedback, no correctness guarantees + - **\`lts\`**: uses the Long Term Support container base image instead of the latest dev image + - **\`dev\`**: uses the latest development container base image (default) + - **\`cadence\`**: per-test trigger filter (recipe \`cadence:\` field). Recipes default to \`[pr, nightly, mergegroup]\`. + SUMMARY + + linting: + runs-on: ubuntu-latest + needs: [pre-flight] + if: | + ( + needs.pre-flight.outputs.is_deployment_workflow == 'false' + && needs.pre-flight.outputs.is_ci_workload == 'true' + ) || ( + needs.pre-flight.outputs.is_deployment_workflow == 'false' + && needs.pre-flight.outputs.is_ci_workload == 'false' + && needs.pre-flight.outputs.docs_only == 'false' + ) + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@v8.1.0 + with: + version: 0.7.2 + + - name: Install linting tools + run: | + uv sync --locked --only-group linting + + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Run linting + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + run: | + export PATH=".venv/bin:$PATH" + export GITLAB_ENDPOINT=github.com + export CI_PROJECT_NAMESPACE=NVIDIA + export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" + export CHECK_ONLY=true + export SKIP_DOCS=false + bash tools/autoformat.sh + + cicd-wait-in-queue: + runs-on: ubuntu-latest + needs: [pre-flight, linting] + environment: "test" + if: | + !(needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.docs_only == 'true') + steps: + - name: Running CI tests + run: | + echo "Running CI tests" + echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}" + + cicd-parse-downstream-testing: + runs-on: ubuntu-latest + needs: + - pre-flight + - configure + - cicd-wait-in-queue + if: | + needs.pre-flight.result != 'cancelled' + && needs.configure.result != 'cancelled' + && 
needs.cicd-wait-in-queue.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + outputs: + mbridge-test-suite: ${{ needs.configure.outputs.mbridge_suite }} + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: How-To + run: bash .github/scripts/readme.sh + + cicd-mbridge-testing: + runs-on: ubuntu-latest + needs: + - pre-flight + - cicd-wait-in-queue + - cicd-parse-downstream-testing + if: | + needs.pre-flight.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-parse-downstream-testing.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Checkout MBridge and create testing branch + uses: actions/checkout@v6 + with: + ref: main + repository: NVIDIA-NeMo/Megatron-Bridge + path: megatron-bridge + token: ${{ secrets.PAT }} + + - name: Create testing branch + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + run: | + cd megatron-bridge + git fetch origin main + git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main + git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force + + - name: Get merge commit sha + shell: bash -x -e -u -o pipefail {0} + id: sha + env: + IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} + IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + run: | + if [[ "$IS_PR" == "true" ]]; then + SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} + elif [[ "$IS_MERGE_GROUP" == "true" ]]; then + SHA=${{ github.event.merge_group.head_sha }} + else + SHA=${GITHUB_SHA} + fi + echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" + + - name: Trigger MBridge tests + uses: convictional/trigger-workflow-and-wait@v1.6.5 + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + with: + owner: NVIDIA-NeMo + repo: Megatron-Bridge + workflow_file_name: cicd-main.yml + github_token: ${{ secrets.PAT }} + ref: ${{ env.MBRIDGE_BRANCH_NAME }} + wait_interval: 60 + propagate_failure: true + client_payload: | + { + "mcore_ref": "${{ steps.sha.outputs.main }}", + "test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}", + "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + + - name: Delete testing branch + if: always() + env: + MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} + run: | + cd megatron-bridge + git push origin --delete ${{ env.MBRIDGE_BRANCH_NAME }} + + cicd-compute-build-matrix: + runs-on: ubuntu-latest + needs: [is-not-external-contributor] + outputs: + matrix: ${{ steps.compute.outputs.matrix }} + steps: + - name: Compute build matrix + id: compute + env: + IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }} + SELECTED_RUNNER: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + SELECTED_RUNNER_GB200: ${{ 
needs.is-not-external-contributor.outputs.selected_runner_gb200 }} + REGISTRY_AWS: ${{ env.container-registry }} + REGISTRY_GCP: ${{ env.container-registry-gb200 }} + run: | + AWS_ENTRY=$(jq -nc --arg registry "$REGISTRY_AWS" --arg runner "$SELECTED_RUNNER" \ + '{"cloud": "aws", "registry": $registry, "runner": $runner}') + if [ "$IS_MAINTAINER" == "true" ]; then + GCP_ENTRY=$(jq -nc --arg registry "$REGISTRY_GCP" --arg runner "$SELECTED_RUNNER_GB200" \ + '{"cloud": "gcp", "registry": $registry, "runner": $runner}') + MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" --argjson gcp "$GCP_ENTRY" \ + '{"include": [$aws, $gcp]}') + else + MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" '{"include": [$aws]}') + fi + echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT" + + cicd-container-build: + needs: [is-not-external-contributor, pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix] + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }} + runs-on: ${{ matrix.runner }} + if: | + needs.is-not-external-contributor.result != 'cancelled' + && needs.pre-flight.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-compute-build-matrix.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + ) + && !cancelled() + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Get merge commit sha + shell: bash -x -e -u -o pipefail {0} + id: sha + env: + IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} + IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + run: | + if [[ "$IS_PR" == "true" ]]; then + SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} + elif [[ "$IS_MERGE_GROUP" == "true" ]]; then + SHA=${{ github.event.merge_group.head_sha }} + else + SHA=${GITHUB_SHA} + fi + echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" + + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ steps.sha.outputs.main }} + + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: 3.12 + + - name: Install GH CLI + shell: bash -x -e -u -o pipefail {0} + run: | + for i in 1 2 3; do + apt-get update && apt-get install -y gh && break + echo "apt attempt $i failed, retrying..." 
+ sleep 10 + done + + - name: Download test data + shell: bash + run: | + echo "::group::Download test data" + pip install --no-cache-dir click requests + python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets + echo "::endgroup::" + + - name: Get last merged PR + id: cache_from + env: + GH_TOKEN: ${{ github.token }} + run: | + LAST_PRS=$(gh api graphql -f query=' + query { + repository(owner: "NVIDIA", name: "Megatron-LM") { + pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { + nodes { + number + } + } + } + }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do + echo "type=registry,ref=${{ matrix.registry }}/megatron-lm:$number-buildcache,mode=max" + done) + + echo "LAST_PRS< unit-tests.json + echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT + + cicd-unit-tests-latest: + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }} + needs: + - is-not-external-contributor + - pre-flight + - cicd-wait-in-queue + - cicd-container-build + - cicd-parse-unit-tests + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + timeout-minutes: 60 + name: "${{ matrix.bucket }} - latest" + if: | + needs.is-not-external-contributor.result != 'cancelled' + && needs.pre-flight.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-container-build.result != 'cancelled' + && needs.cicd-parse-unit-tests.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + PIP_NO_PYTHON_VERSION_WARNING: 1 + PIP_ROOT_USER_ACTION: ignore + steps: + - name: Checkout + uses: actions/checkout@v6 + - name: main + uses: ./.github/actions + with: + test_case: ${{ matrix.bucket }} + tag: latest + timeout: ${{ matrix.timeout || 30 }} + is_unit_test: "true" + PAT: ${{ secrets.PAT }} + container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} + + cicd-parse-integration-tests-h100: + runs-on: ubuntu-latest + needs: + - pre-flight + - configure + - cicd-wait-in-queue + - cicd-container-build + - cicd-unit-tests-latest + if: | + needs.pre-flight.result != 'cancelled' + && needs.configure.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-container-build.result != 'cancelled' + && needs.cicd-unit-tests-latest.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + outputs: + integration-tests-h100: ${{ steps.main.outputs.integration-tests-h100 }} + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Parse functional tests + id: main + env: + SCOPE: ${{ needs.configure.outputs.scope }} + LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }} + CADENCE: ${{ needs.configure.outputs.cadence }} + run: | + export PYTHONPATH=$(pwd) + + ARGS=(--scope $SCOPE) + [ "$LIGHTWEIGHT" == "true" ] && ARGS+=(--enable-lightweight-mode) + # CADENCE is empty when label-based bypass is active; pass through + # only when set so generate_jet_trigger_job sees None and skips the filter. 
+ [ -n "$CADENCE" ] && ARGS+=(--cadence "$CADENCE") + + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + --n-repeat 5 \ + --time-limit 2700 \ + --test-cases all \ + --container-image mcore_ci_dev \ + --container-tag latest \ + --dependent-job functional:configure \ + --record-checkpoints false \ + --slurm-account gh \ + --no-enable-warmup \ + --environment dev \ + --platform dgx_h100 \ + --cluster ghci \ + ${ARGS[@]} \ + --output-path integration-tests-h100.yaml + + cat integration-tests-h100.yaml | \ + yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-h100.json + + echo "integration-tests-h100=$(cat integration-tests-h100.json)" | tee -a "$GITHUB_OUTPUT" + + cicd-integration-tests-latest-h100: + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }} + needs: + - is-not-external-contributor + - pre-flight + - configure + - cicd-wait-in-queue + - cicd-parse-integration-tests-h100 + - cicd-unit-tests-latest + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + PIP_NO_PYTHON_VERSION_WARNING: 1 + PIP_ROOT_USER_ACTION: ignore + if: | + needs.is-not-external-contributor.result != 'cancelled' + && needs.pre-flight.result != 'cancelled' + && needs.configure.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-parse-integration-tests-h100.result != 'cancelled' + && needs.cicd-unit-tests-latest.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + steps: + - name: Checkout + uses: actions/checkout@v6 + - name: main + uses: ./.github/actions + with: + test_case: ${{ matrix.test_case }} + model: ${{ matrix.model }} + tag: latest + timeout: ${{ matrix.timeout || 30 }} + is_unit_test: "false" + PAT: ${{ secrets.PAT }} + container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} + scope: ${{ needs.configure.outputs.scope }} + n_repeat: ${{ needs.configure.outputs.n_repeat }} + lightweight: ${{ needs.configure.outputs.lightweight }} + cadence: ${{ needs.configure.outputs.cadence }} + + cicd-parse-integration-tests-gb200: + runs-on: ubuntu-latest + needs: + - is-not-external-contributor + - pre-flight + - configure + - cicd-wait-in-queue + - cicd-container-build + - cicd-unit-tests-latest + if: | + needs.is-not-external-contributor.outputs.is_maintainer == 'true' + && needs.pre-flight.result != 'cancelled' + && needs.configure.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-container-build.result != 'cancelled' + && needs.cicd-unit-tests-latest.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + outputs: + integration-tests-gb200: ${{ steps.main.outputs.integration-tests-gb200 }} + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Parse functional tests + id: main + env: + SCOPE: ${{ needs.configure.outputs.scope }} + LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }} + CADENCE: 
${{ needs.configure.outputs.cadence }} + run: | + export PYTHONPATH=$(pwd) + + ARGS=(--scope $SCOPE) + [ "$LIGHTWEIGHT" == "true" ] && ARGS+=(--enable-lightweight-mode) + # CADENCE is empty when label-based bypass is active; pass through + # only when set so generate_jet_trigger_job sees None and skips the filter. + [ -n "$CADENCE" ] && ARGS+=(--cadence "$CADENCE") + + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + --n-repeat 5 \ + --time-limit 2700 \ + --test-cases all \ + --container-image mcore_ci_dev \ + --container-tag latest \ + --dependent-job functional:configure \ + --record-checkpoints false \ + --slurm-account gh \ + --no-enable-warmup \ + --environment dev \ + --platform dgx_gb200 \ + --cluster dgxgb200_oci-hsg \ + ${ARGS[@]} \ + --output-path integration-tests-gb200.yaml + + cat integration-tests-gb200.yaml | \ + yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-gb200.json + + echo "integration-tests-gb200=$(cat integration-tests-gb200.json)" | tee -a "$GITHUB_OUTPUT" + + cicd-integration-tests-latest-gb200: + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }} + needs: + - is-not-external-contributor + - pre-flight + - configure + - cicd-wait-in-queue + - cicd-parse-integration-tests-gb200 + - cicd-unit-tests-latest + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }} + name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + PIP_NO_PYTHON_VERSION_WARNING: 1 + PIP_ROOT_USER_ACTION: ignore + if: | + needs.is-not-external-contributor.outputs.is_maintainer == 'true' + && needs.is-not-external-contributor.result != 'cancelled' + && needs.pre-flight.result != 'cancelled' + && needs.configure.result != 'cancelled' + && needs.cicd-wait-in-queue.result != 'cancelled' + && needs.cicd-parse-integration-tests-gb200.result != 'cancelled' + && needs.cicd-unit-tests-latest.result != 'cancelled' + && ( + success() + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.force_run_all == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + ) + && !cancelled() + steps: + - name: Checkout + uses: actions/checkout@v6 + - name: main + uses: ./.github/actions + with: + test_case: ${{ matrix.test_case }} + model: ${{ matrix.model }} + tag: latest + timeout: ${{ matrix.timeout || 30 }} + is_unit_test: "false" + PAT: ${{ secrets.PAT }} + container-image: ${{ env.container-registry-gb200 }}/megatron-lm:${{ github.sha }} + scope: ${{ needs.configure.outputs.scope }} + n_repeat: ${{ needs.configure.outputs.n_repeat }} + lightweight: ${{ needs.configure.outputs.lightweight }} + platform: dgx_gb200 + cadence: ${{ needs.configure.outputs.cadence }} + + Nemo_CICD_Test: + needs: + - pre-flight + - is-not-external-contributor + - cicd-unit-tests-latest + - cicd-integration-tests-latest-h100 + - cicd-integration-tests-latest-gb200 + if: | + ( + needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + || needs.pre-flight.outputs.is_ci_workload == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || always() + ) + && !cancelled() + && github.repository == 'NVIDIA/Megatron-LM' + runs-on: ubuntu-latest + permissions: write-all + steps: + - name: Checkout + uses: actions/checkout@v6 + + 
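+ # Result gate: unit and H100 results must be 'success'; GB200 may be + # 'skipped' only on non-maintainer runs (no GB200 runners available). + # A broad `gh run view` scan then catches stray per-matrix failures or + # cancellations that the aggregated needs results can miss. 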
- name: Get workflow result + id: result + shell: bash -x -e -u -o pipefail {0} + env: + GH_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }} + IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }} + IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }} + UNIT_RESULT: ${{ needs.cicd-unit-tests-latest.result }} + H100_RESULT: ${{ needs.cicd-integration-tests-latest-h100.result }} + GB200_RESULT: ${{ needs.cicd-integration-tests-latest-gb200.result }} + run: | + # Docs-only and deployment workflows intentionally skip all tests + if [ "$DOCS_ONLY" == "true" ] || [ "$IS_DEPLOYMENT" == "true" ]; then + echo "✅ Docs-only or deployment workflow — test checks skipped" + exit 0 + fi + + FAILED=false + + # Unit tests must always succeed (never skipped or cancelled) + if [ "$UNIT_RESULT" != "success" ]; then + echo "❌ cicd-unit-tests-latest: $UNIT_RESULT" + FAILED=true + fi + + # H100 integration tests must always succeed + if [ "$H100_RESULT" != "success" ]; then + echo "❌ cicd-integration-tests-latest-h100: $H100_RESULT" + FAILED=true + fi + + # GB200 integration tests may be skipped only for non-maintainer PRs + # (no GB200 runners available); maintainer runs must always succeed + if [ "$GB200_RESULT" == "skipped" ] && [ "$IS_MAINTAINER" == "true" ]; then + echo "❌ cicd-integration-tests-latest-gb200: skipped unexpectedly for a maintainer run" + FAILED=true + elif [ "$GB200_RESULT" != "success" ] && [ "$GB200_RESULT" != "skipped" ]; then + echo "❌ cicd-integration-tests-latest-gb200: $GB200_RESULT" + FAILED=true + fi + + # Broad scan: catch any individual job failures or cancellations + # (e.g. a single matrix instance cancelled mid-run) + BAD_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq ' + [.jobs[] | select( + .status == "completed" + and (.conclusion == "failure" or .conclusion == "cancelled") + and .name != "merge-queue-notification" + and .name != "cicd-mbridge-testing" + )] | length + ') || BAD_JOBS=0 + + if [ "${BAD_JOBS:-0}" -gt 0 ]; then + echo "❌ Found ${BAD_JOBS} failed or cancelled job(s):" + gh run view $GITHUB_RUN_ID --json jobs --jq ' + .jobs[] | select( + .status == "completed" + and (.conclusion == "failure" or .conclusion == "cancelled") + and .name != "merge-queue-notification" + and .name != "cicd-mbridge-testing" + ) | .name + " → " + .conclusion + ' + FAILED=true + fi + + if [ "$FAILED" != "true" ]; then + echo "✅ All previous jobs completed successfully" + else + exit 1 + fi + + Coverage_Fake: + runs-on: ubuntu-latest + needs: [Nemo_CICD_Test, pre-flight] + if: | + ( + needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + ) + && needs.pre-flight.outputs.is_ci_workload == 'false' + && !cancelled() + && github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Generate fake coverage report + uses: actions/github-script@v8 + with: + github-token: ${{ secrets.PAT }} + script: | + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: context.sha, + state: 'success', + description: 'No code changes - coverage check skipped', + context: 'codecov/patch' + }); + + Coverage: + runs-on: ubuntu-latest + needs: [pre-flight, Nemo_CICD_Test] + if: | + ( + (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure()) + || (needs.pre-flight.outputs.is_merge_group == 'true' && !failure()) + || success() + ) + && !cancelled() + && github.repository == 'NVIDIA/Megatron-LM' + 
strategy: + matrix: + flag: [unit-test] + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' + uses: nv-gha-runners/get-pr-info@main + + - name: Checkout + uses: actions/checkout@v6 + + - name: Download coverage reports of current branch + uses: actions/download-artifact@v7 + with: + pattern: coverage-${{ matrix.flag }}-* + + - name: List coverage files + run: find . -type f -name "*.xml" -o -name "*.lcov" + + - name: Get total coverage of current branch + shell: bash -x -e -u -o pipefail {0} + if: always() + run: | + pip install coverage + + ls -al . + ls -al coverage-*/ + coverage combine --keep $(ls coverage-*/.coverage) + coverage report -i + rm -rf coverage-* + ls -al + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true + flags: ${{ matrix.flag }} + base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').base.sha }} + + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + name: coverage-${{ matrix.flag }}-aggregated + path: | + .coverage + include-hidden-files: true + + merge-queue-notification: + runs-on: ubuntu-latest + if: github.event_name == 'merge_group' + permissions: + pull-requests: write + steps: + - name: Extract PR number from merge group + id: get-pr-number + run: | + # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<NUMBER>-<HEAD_SHA>) + PR_NUMBER=$(echo "${{ github.event.merge_group.head_ref }}" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p') + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + + - name: Comment on PR with action run URL + uses: actions/github-script@v8 + with: + github-token: ${{ secrets.PAT }} + script: | + const prNumber = ${{ steps.get-pr-number.outputs.pr_number }}; + const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}` + }); + + cleanup-taint-node: + runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} + needs: + - pre-flight + - is-not-external-contributor + - cicd-container-build + - cicd-unit-tests-latest + - cicd-integration-tests-latest-h100 + - cicd-integration-tests-latest-gb200 + - Coverage + - Coverage_Fake + if: | + always() + && !cancelled() + && contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') + && needs.pre-flight.outputs.is_deployment_workflow != 'true' + steps: + - name: Taint node for cleanup + shell: bash + run: taint-node.sh diff --git a/.github/workflows/claude-complexity-label.yml b/.github/workflows/claude-complexity-label.yml new file mode 100644 index 00000000000..356eed2da29 --- /dev/null +++ b/.github/workflows/claude-complexity-label.yml @@ -0,0 +1,60 @@ +name: Claude Complexity Label + +on: + pull_request_target: + types: [ready_for_review] + +jobs: + label-complexity: + name: Label PR Complexity + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + env: + GH_TOKEN: ${{ secrets.PAT }} + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Run Claude Complexity Analysis + uses: anthropics/claude-code-action@v1 + with: + 
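+ # Worked example of the prompt's complexity formula below (hypothetical + # counts): 120 real code lines + 40 test lines → 120 + 40/10 = 124, + # which falls in [100, 500) → label "complexity: medium". 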
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + github_token: ${{ secrets.PAT }} + prompt: | + REPO: ${{ env.REPO }} + PR NUMBER: ${{ env.PR_NUMBER }} + + You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label. + + STEPS: + 1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO + 2. Analyze every changed line (added or removed) in the diff and classify each as one of: + - "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text + - "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory) + - "real code": all other changes (functional source code) + 3. Compute "real code line changes" using this formula: + real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10) + Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings. + 4. Remove any previously applied complexity or docs-only labels: + gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only" + 5. Apply exactly ONE label using the gh CLI: + - If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only": + gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only" + - If real_code_line_changes < 100, apply label "complexity: low": + gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low" + - If real_code_line_changes >= 100 and < 500, apply label "complexity: medium": + gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium" + - If real_code_line_changes >= 500, apply label "complexity: high": + gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high" + + Do NOT post any comments on the PR. Only apply the label. + claude_args: | + --allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)" diff --git a/.github/workflows/claude-copy-to-main.yml b/.github/workflows/claude-copy-to-main.yml new file mode 100644 index 00000000000..7bde3941bb8 --- /dev/null +++ b/.github/workflows/claude-copy-to-main.yml @@ -0,0 +1,122 @@ +name: Claude Copy PR to Main + +on: + issue_comment: + types: [created] + +jobs: + copy-to-main: + name: Copy PR to Main + if: | + github.event_name == 'issue_comment' && + github.event.issue.pull_request && + contains(github.event.comment.body, '/claude copy') + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + issues: write + id-token: write + env: + GH_TOKEN: ${{ secrets.PAT }} + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.issue.number }} + steps: + - name: Check commenter has write access + env: + COMMENTER: ${{ github.event.comment.user.login }} + run: | + PERMISSION=$(gh api repos/$REPO/collaborators/$COMMENTER/permission --jq .permission) + if [[ "$PERMISSION" != "admin" && "$PERMISSION" != "write" ]]; then + gh pr comment $PR_NUMBER --repo $REPO --body "❌ You do not have write access to use \`/claude copy\`." + exit 1 + fi + + - name: Check PR is merged and targets non-main + run: | + PR_JSON=$(gh pr view $PR_NUMBER --repo $REPO --json baseRefName,mergedAt) + PR_BASE=$(echo "$PR_JSON" | jq -r .baseRefName) + PR_MERGED=$(echo "$PR_JSON" | jq -r .mergedAt) + + if [ "$PR_BASE" = "main" ]; then + gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR already targets \`main\`. 
\`/claude copy\` only works on PRs targeting non-main branches." + exit 1 + fi + + if [ "$PR_MERGED" = "null" ] || [ -z "$PR_MERGED" ]; then + gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR has not been merged yet. \`/claude copy\` only works on merged PRs." + exit 1 + fi + + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + token: ${{ secrets.PAT }} + + - name: Fetch PR head ref from fork + run: | + git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER-head + + - name: Run Claude Copy to Main + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + trigger_phrase: "/claude copy" + github_token: ${{ secrets.PAT }} + prompt: | + REPO: ${{ env.REPO }} + PR NUMBER: ${{ env.PR_NUMBER }} + + You are a PR copy assistant. Your job is to apply the final changes from a merged PR onto a new branch based on `main` and create a new PR targeting `main`. + + The PR's commits originated from a fork and have been fetched locally as the branch: pr-${PR_NUMBER}-head + + STEPS: + 1. Get the PR details (title, body, and base branch): + gh pr view $PR_NUMBER --repo $REPO --json title,body,baseRefName + + 2. Configure git for committing (use the svcnvidia-nemo-ci service account since secrets.PAT belongs to it): + git config user.name "svcnvidia-nemo-ci" + git config user.email "svcnvidia-nemo-ci@nvidia.com" + + 3. Create a new branch from `main`: + git checkout main + git pull origin main + git checkout -b copy-pr-${PR_NUMBER}-to-main + + 4. Generate a patch of the PR's final changes and apply it: + MERGE_BASE=$(git merge-base origin/<BASE_BRANCH> pr-${PR_NUMBER}-head) + git diff $MERGE_BASE pr-${PR_NUMBER}-head | git apply --3way + (Replace <BASE_BRANCH> with the actual base branch name from step 1; a worked example follows the IMPORTANT notes below.) + + If the apply fails due to merge conflicts: + a. Identify conflicted files: git diff --name-only --diff-filter=U + b. For each conflicted file, read its contents to see the conflict markers + c. Resolve the conflicts by favoring the `main` branch side when there is a genuine + conflict between the two sides. The goal is to bring the PR's changes into main + without overriding what is already on main. + d. Stage the resolved files: git add <resolved files> + + 5. Commit the changes: + git add -A + git commit -m "Copy PR #${PR_NUMBER} to main" + + 6. Push the new branch: + git push origin copy-pr-${PR_NUMBER}-to-main + + 7. Create a new PR targeting `main`: + gh pr create --repo $REPO \ + --base main \ + --head copy-pr-${PR_NUMBER}-to-main \ + --title "[Copy to main] <ORIGINAL_TITLE>" \ + --body "🤖 **This PR was auto-generated by Claude** via the \`/claude copy\` command.\n\nCherry-picked from #${PR_NUMBER}.\n\n---\n\n<ORIGINAL_BODY>" + + 8. Comment on the original PR with a link to the newly created PR. + + IMPORTANT: + - When resolving merge conflicts, favor `main` over the non-main branch. Do not override changes already on main. + - Do NOT force push. 
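+ Worked example for step 4 (hypothetical PR #1234 whose base branch is `dev`): + MERGE_BASE=$(git merge-base origin/dev pr-1234-head) + git diff $MERGE_BASE pr-1234-head | git apply --3way 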
+ claude_args: | + --allowedTools "Bash(git:*),Bash(gh:*),Read,Edit" + --model "claude-opus-4-6" diff --git a/.github/workflows/claude_review.yml b/.github/workflows/claude_review.yml new file mode 100644 index 00000000000..b7d5f1217c0 --- /dev/null +++ b/.github/workflows/claude_review.yml @@ -0,0 +1,271 @@ +name: Claude Code Review + +on: + issue_comment: + types: [created] + +jobs: + # ────────────────────────────────────────────────────────────────── + # Light review: quick pass for obvious bugs, typos, and test gaps + # Trigger: /claude review + # ────────────────────────────────────────────────────────────────── + light-review: + name: Claude Light Review + if: | + github.event_name == 'issue_comment' && + github.event.issue.pull_request && + contains(github.event.comment.body, '/claude review') + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.issue.number }} + steps: + - name: Get PR head commit + id: get-pr-head-commit + run: | + echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT + + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 1 + ref: ${{ steps.get-pr-head-commit.outputs.sha }} + + - name: React to trigger comment + run: | + gh api repos/$REPO/issues/comments/${{ github.event.comment.id }}/reactions \ + --method POST \ + -f content='eyes' + + - name: Run Claude Light Review + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + trigger_phrase: "/claude review" + show_full_output: true + claude_args: | + --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*),Read" + --model "claude-opus-4-6" + prompt: | + REPO: ${{ env.REPO }} + PR NUMBER: ${{ env.PR_NUMBER }} + + Mandatory workflow — never skip or reorder: + 1. Read the PR diff first (gh pr diff). + 2. Based on the changed files and areas, identify relevant skills from skills/<skill-name>/SKILL.md. + Common skill names: build-and-dependency, testing, cicd, linting-and-formatting, run-on-slurm, + nightly-sync, create-issue, respond-to-issue, split-pr, onboard-gb200-1node-tests. + 3. Read the SKILL.md files for all relevant areas using the Read tool. + 4. Only then perform the review using the skill context. + + You are doing a light code review. Keep it concise and actionable. + + Focus ONLY on: + - Critical bugs or logic errors + - Typos in code, comments, or strings + - Missing or insufficient test coverage for changed code + - If the PR adds a new feature or significant functionality without corresponding tests, suggest adding tests + - If the PR fixes a bug that was not caught by an existing unit test, suggest adding a regression test to prevent recurrence + - Outdated or inaccurate documentation affected by the changes + + Do NOT comment on: + - Style preferences or formatting + - Minor naming suggestions + - Architectural opinions or refactoring ideas + - Performance unless there is a clear, measurable issue + + Only use inline ```suggestion blocks for simple, self-contained line replacements (typos, + renames, single-line fixes). For structural changes that add, remove, or reorganize blocks + of code (e.g. 
adding a new function, inserting a YAML step, reordering logic), use a + top-level PR comment with a code block showing the proposed change instead — inline + suggestions cannot express insertions or multi-block restructuring and will break the code + if applied. + + It's perfectly acceptable to not have anything to comment on. + If you do not have anything to comment on, approve the PR with: gh pr review $PR_NUMBER --repo $REPO --approve --body "LGTM" + + # ────────────────────────────────────────────────────────────────── + # Strict review: comprehensive Megatron-LM focused analysis + # covering precision, parallelism correctness, performance, + # backward compatibility, and code quality + # Trigger: /claude strict-review + # ────────────────────────────────────────────────────────────────── + strict-review: + name: Claude Strict Review + if: | + github.event_name == 'issue_comment' && + github.event.issue.pull_request && + contains(github.event.comment.body, '/claude strict-review') + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.issue.number }} + steps: + - name: Get PR info + id: pr-info + run: | + PR_DATA=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid,baseRefName) + echo "sha=$(echo $PR_DATA | jq -r .headRefOid)" >> $GITHUB_OUTPUT + echo "base_ref=$(echo $PR_DATA | jq -r .baseRefName)" >> $GITHUB_OUTPUT + + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 1 + ref: ${{ steps.pr-info.outputs.sha }} + + - name: Fetch base branch for diff analysis + run: git fetch origin ${{ steps.pr-info.outputs.base_ref }} + + - name: React to trigger comment + run: | + gh api repos/$REPO/issues/comments/${{ github.event.comment.id }}/reactions \ + --method POST \ + -f content='eyes' + + - name: Run Claude Strict Review + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + trigger_phrase: "/claude strict-review" + show_full_output: true + claude_args: | + --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*),Bash(git diff:*),Bash(git show:*),Bash(git log:*),Read" + --model "claude-opus-4-6" + prompt: | + REPO: ${{ env.REPO }} + PR NUMBER: ${{ env.PR_NUMBER }} + BASE REF: origin/${{ steps.pr-info.outputs.base_ref }} + + Mandatory workflow — never skip or reorder: + 1. Read the PR diff first (gh pr diff). + 2. Based on the changed files and areas, identify relevant skills from skills/<skill-name>/SKILL.md. + Common skill names: build-and-dependency, testing, cicd, linting-and-formatting, run-on-slurm, + nightly-sync, create-issue, respond-to-issue, split-pr, onboard-gb200-1node-tests. + 3. Read the SKILL.md files for all relevant areas using the Read tool. + 4. Only then perform the review using the skill context. + + You are performing a strict, comprehensive code review on a **Megatron-LM** Pull Request. + Megatron-LM is NVIDIA's large-scale distributed training framework for LLMs. + Review the diff with a focus on **implementation correctness**, **training performance**, and **backward compatibility**. + + ## Review Procedure + + 1. Get PR metadata: `gh pr view $PR_NUMBER --repo $REPO --json title,body,baseRefName,headRefName,files,additions,deletions,changedFiles,author` + 2. 
Get the full diff: `gh pr diff $PR_NUMBER --repo $REPO` + - For large PRs (>50 files), prioritize source code over config/lock/auto-generated files. + 3. For each significant changed file, read the full file for surrounding context. + 4. Trace data flow and dtype through computation paths to verify correctness. + 5. For each newly introduced variable/argument/field, verify it has a meaningful runtime use path (see Mandatory Check below). + 6. Post findings as inline comments with severity and category tags. + + ## Critical Issues (Must Fix) + + ### Implementation Correctness + - **dtype handling**: Verify operations use the correct dtype at each computation stage — explicit casts must be present at mixed-precision boundaries (e.g. fp16 compute → fp32 accumulation → fp16 output) + - **Loss scaling logic**: Verify DynamicLossScaler changes correctly detect inf/nan, adjust scale factor, and skip optimizer steps — incorrect logic causes training divergence or silent underflow + - **Reduction operations**: Verify reductions (sum, mean, allreduce) use correct dtype, reduction dimension, and normalization factor — wrong dimension or missing fp32 upcast produces silently wrong gradients + - **Normalization layers**: Verify LayerNorm/RMSNorm compute variance and mean on the correct dimension, with correct epsilon placement and upcast before rsqrt + - **Attention computation**: Verify QK^T scaling factor, softmax input dtype, causal mask application, and dropout placement match the intended algorithm + - **Residual connections**: Verify the correct tensor is added (pre-norm vs post-norm) with appropriate dtype for accumulation + - **Optimizer updates**: Verify state updates follow the correct formula — momentum/variance update order, bias correction, weight decay application + - **Gradient clipping**: Verify norm computation uses correct parameter set, norm type (L2 vs inf), and fp32 dtype + - **Embedding/output layer**: Verify weight tying is correctly wired, logit projection uses the right matrix, and output dtype matches expectation + - **MoE routing/aux loss**: Verify expert routing logic (top-k selection, capacity enforcement, token dropping) and auxiliary loss computation follow the intended algorithm + + ### Correctness + - **Tensor parallel**: Incorrect scatter/gather or allreduce placement — silent wrong results across TP ranks + - **Pipeline parallel**: Wrong microbatch scheduling, missing send/recv synchronization, incorrect grad accumulation across pipeline stages + - **Sequence parallel**: Incorrect sequence dimension partitioning or missing allgather/reduce-scatter in SP regions + - **Context parallel**: Incorrect KV cache partitioning or ring attention implementation errors + - **Expert parallel**: Token routing/dispatch errors across EP ranks, incorrect capacity factor handling + - **Gradient accumulation**: Missing no_sync() context or incorrect division factor when accumulating across microbatches + - **Checkpoint save/load**: State dict key mismatch, missing optimizer states, incorrect RNG state restoration — causes silent divergence after resume + - **RNG state management**: Incorrect random seed handling across TP/PP/DP ranks, causing correlated dropout masks or data sampling + + ## Important Issues (Should Fix) + + ### Training Performance + - **Unnecessary CPU-GPU sync**: .item(), .cpu(), torch.cuda.synchronize(), Python-side tensor value checks in training loop — kills throughput + - **Redundant communication**: Allreduce/allgather that could be fused, overlapped with 
compute, or eliminated + - **Memory inefficiency**: Missing activation checkpointing on memory-heavy layers, unnecessary tensor clones or .contiguous() calls + - **Communication-computation overlap**: Missed opportunities to overlap allreduce with backward, or allgather with forward + - **Kernel launch overhead**: Python loops over small ops that should be fused into a single kernel + - **CUDA graph compatibility**: Dynamic shapes, Python-side conditionals on tensor values, host-device sync inside captured region + + ### Backward Compatibility + - **Config/argument changes**: Renamed or removed arguments without deprecation path — breaks existing training scripts + - **Checkpoint format changes**: Modified state dict keys/structure without migration logic — makes existing checkpoints unloadable + - **Default value changes**: Changed defaults for training hyperparameters or parallelism settings — silently alters behavior for users relying on defaults + - **API contract changes**: Changed function signatures, return types, or side effects in megatron/core/ without backward-compat shim + - **Model architecture changes**: Altered layer ordering, initialization, or normalization placement — existing pretrained weights become incompatible + + ### Mandatory Check: Unused New Variables / Arguments + - For each changed file, list newly added identifiers (function args, config fields, locals). + - Verify each has a meaningful read/use path — not just declaration/docstring or discard assignment (_ = new_arg). + - Use Grep to search for usage beyond declaration sites. + - Treat placeholder discard patterns as findings unless explicitly documented as temporary migration shim. + - If usage is intentionally deferred, flag and request explicit TODO + migration note. + + ## Suggestions (Nice to Have) + + ### Naming + - Name must describe what the thing *is*, not what it's *used for* + - No abbreviations in parallel/distributed code — use full names (token_dispatcher, routing_map, comm_manager, world_size) + - Naming consistency within scope for variables serving the same role + + ### Function/Method Decomposition + - Functions over ~50 lines mixing data collection, reduction, computation, and I/O should be split + - Non-trivial logic blocks embedded in a method with different primary purpose should be extracted + + ### Simplification + - Redundant operations (e.g. 
.reshape(()) on 0-dim tensor, two-step constructions where one suffices) + - Setup constant across training should not run on every forward pass — move to __init__ + - Dead complexity that doesn't achieve its stated purpose + - Unnecessary intermediate aliases adding indirection with no abstraction value + + ### Other + - Stale, imprecise, or misleading comments/docstrings — a wrong docstring is worse than none + - Missing shape/dtype assertions at parallelism boundaries + + ## What NOT to Comment On + - Style/formatting issues (leave to linters) + - Test code that is reasonably clear + - Clearly intentional design decisions by the author + - Pure refactoring that preserves identical behavior (verify via diff) + - Findings invalidated by deeper analysis — drop them entirely rather than hedging + + ## Comment Format + + Prefix each comment with severity and category tag: + - `**[CRITICAL Implementation]**`, `**[CRITICAL Correctness]**` + - `**[IMPORTANT Performance]**`, `**[IMPORTANT Compatibility]**` + - `**[SUGGESTION Naming]**`, `**[SUGGESTION Simplification]**` + + For each finding, explain: (1) what the issue is, (2) why it matters (impact/risk), (3) specific suggestion for fix. + + Only use inline ```suggestion blocks for simple, self-contained line replacements (typos, + renames, single-line fixes). For structural changes that add, remove, or reorganize blocks + of code, use a top-level PR comment with a code block showing the proposed change instead. + + ## Completion + + After posting all inline comments, post a summary PR comment: + - List total findings by severity (CRITICAL: N, IMPORTANT: N, SUGGESTION: N) + - Highlight the most impactful findings + - Overall assessment of the PR's risk level + + If no significant issues are found, approve the PR: + gh pr review $PR_NUMBER --repo $REPO --approve --body "Strict review passed — no significant issues found. LGTM" diff --git a/.github/workflows/close-inactive-issue-pr.yml b/.github/workflows/close-inactive-issue-pr.yml new file mode 100644 index 00000000000..7dcac837ba9 --- /dev/null +++ b/.github/workflows/close-inactive-issue-pr.yml @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Stale-Close-Inactive-Issues-PRs +on: + schedule: + - cron: "30 1 * * *" + +jobs: + close-issues: + if: github.repository == 'NVIDIA/Megatron-LM' + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_close_inactive_issue_pr.yml@v0.44.0 diff --git a/.github/workflows/community-bot.yml b/.github/workflows/community-bot.yml new file mode 100644 index 00000000000..1a98ece0f85 --- /dev/null +++ b/.github/workflows/community-bot.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Community Bot + +on: + issues: + types: [opened, edited, reopened, closed, deleted] + issue_comment: + types: [created, edited, deleted] + +jobs: + community-bot: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10 + with: + community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }} + if: github.repository == 'NVIDIA/Megatron-LM' + secrets: + GH_TOKEN: ${{ secrets.PAT }} diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json new file mode 100644 index 00000000000..19fb0e42364 --- /dev/null +++ b/.github/workflows/config/changelog-config.json @@ -0,0 +1,24 @@ +{ + "categories": [], + "ignore_labels": [ + "ignore" + ], + "sort": "ASC", + "template": "\n${{CHANGELOG}}\n\n
<details>\n<summary>Changelog Details</summary>\n\n${{UNCATEGORIZED}}\n</details>
\n", + "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}", + "commit_template": "- ${{TITLE}} by @${{AUTHOR}}", + "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}", + "duplicate_filter": { + "pattern": ".+", + "on_property": "title", + "method": "match" + }, + "transformers": [], + "max_tags_to_fetch": 100, + "max_pull_requests": 1250, + "max_back_track_time_days": 365, + "exclude_merge_branches": [], + "tag_resolver": { + "method": "semver" + } +} diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml new file mode 100644 index 00000000000..484a66fb0e0 --- /dev/null +++ b/.github/workflows/copyright-check.yml @@ -0,0 +1,70 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Copyright check + +on: + push: + branches: + - "pull-request/[0-9]+" + - "deploy-release/*" + merge_group: + types: [checks_requested] + +jobs: + pre-flight: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v1.0.0 + if: github.repository == 'NVIDIA/Megatron-LM' + + copyright-check: + needs: [pre-flight] + if: | + !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + && github.repository == 'NVIDIA/Megatron-LM' + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v1.0.0 + + copyright-check-summary: + needs: [pre-flight, copyright-check] + if: | + ( + needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + || always() + ) + && !cancelled() + && github.repository == 'NVIDIA/Megatron-LM' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Result + env: + GH_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 + + if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + # Show which jobs failed + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' + exit 1 + fi diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml new file mode 100644 index 00000000000..81a5cd57d12 --- /dev/null +++ b/.github/workflows/dependabot.yml @@ -0,0 +1,61 @@ +name: Dependabot +on: + schedule: + - cron: "0 8 * * 1" + workflow_dispatch: # Allow manual triggering + +permissions: + id-token: write + contents: write + +jobs: + 
get-release-branch-names: + runs-on: ubuntu-latest + outputs: + mcore: ${{ steps.get-branch.outputs.mcore_release_branch }} + if: github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Get release branch names + id: get-branch + env: + PAT: ${{ secrets.PAT }} + run: | + latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' | + grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' | + sort -V | + tail -n1) + echo "mcore_release_branch=$latest_branch" | tee -a $GITHUB_OUTPUT + + bump-tags: + needs: [get-release-branch-names] + if: github.repository == 'NVIDIA/Megatron-LM' + strategy: + fail-fast: false + matrix: + include: + - target-branch: ${{ needs.get-release-branch-names.outputs.mcore }} + - target-branch: main + uses: ./.github/workflows/_update_dependencies.yml + with: + target-branch: ${{ matrix.target-branch }} + secrets: + PAT: ${{ secrets.PAT }} + SSH_KEY: ${{ secrets.SSH_KEY }} + SSH_PWD: ${{ secrets.SSH_PWD }} + + notify: + if: failure() && github.repository == 'NVIDIA/Megatron-LM' + runs-on: ubuntu-latest + needs: [bump-tags] + steps: + - name: Notify + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + run: | + curl -X POST \ + -H 'Content-type: application/json' \ + --data "{\"text\":\":robot_joy: failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \ + $SLACK_WEBHOOK diff --git a/.github/workflows/force-draft-pr.yml b/.github/workflows/force-draft-pr.yml new file mode 100644 index 00000000000..d45dabf14b7 --- /dev/null +++ b/.github/workflows/force-draft-pr.yml @@ -0,0 +1,36 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +name: Force Draft PR + +on: + pull_request_target: + types: [opened] + branches: + - main + +permissions: + pull-requests: write + +jobs: + force-draft: + runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }} + steps: + - name: Convert PR to draft + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }} + + - name: Add comment explaining draft policy + env: + GH_TOKEN: ${{ github.token }} + run: | + gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \ + "This PR has been automatically converted to **draft** because all PRs must start as drafts. + + When you are ready for review, click **Ready for Review** to begin the review process. This will: + 1. Add the oncall reviewer (optional reviewer) + 2. Add required review teams based on your changes + + See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details." diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml new file mode 100644 index 00000000000..f340e5aa2d8 --- /dev/null +++ b/.github/workflows/install-test.yml @@ -0,0 +1,162 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This workflow verifies that the basic install works across all supported platforms. +# For basic install, all imports need to either be successful or appropriately guarded. + +name: Installation Test + +on: + push: + branches: + - dev + - main + - "pull-request/[0-9]+" + - "deploy-release/*" + merge_group: + types: [checks_requested] + +jobs: + pre-flight: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v1.0.0 + if: github.repository == 'NVIDIA/Megatron-LM' + + pip-test-pytorch: + needs: [pre-flight] + if: | + !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + && github.repository == 'NVIDIA/Megatron-LM' + runs-on: linux-amd64-cpu16 + name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch + container: + image: nvcr.io/nvidia/pytorch:25.05-py3 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set PATH + run: | + echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV" + echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV" + echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV" + echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV" + echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV" + + - name: Install megatron-core + shell: bash -x -e -u -o pipefail {0} + run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }} + + - name: Checkout check-imports + uses: actions/checkout@v6 + with: + repository: NVIDIA-NeMo/FW-CI-templates + ref: v0.63.2 + path: FW-CI-templates + + - name: Check imports for megatron-core + uses: ./FW-CI-templates/.github/actions/check-imports + with: + package-name: megatron.core + python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python + + uv-test-pytorch: + needs: [pre-flight] + if: | + !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + && github.repository == 'NVIDIA/Megatron-LM' + runs-on: linux-amd64-cpu16 + name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch + container: + image: nvcr.io/nvidia/pytorch:25.05-py3 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set PATH + run: | + echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV" + echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV" + echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV" + echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV" + echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV" + echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV" + echo 
"TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV" + + - name: Install project + shell: bash + run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv + + # NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines. + # - name: Checkout check-imports + # uses: actions/checkout@v6 + # with: + # repository: NVIDIA-NeMo/FW-CI-templates + # ref: v0.63.2 + # path: FW-CI-templates + + # - name: Check imports for megatron-core + # uses: ./FW-CI-templates/.github/actions/check-imports + # with: + # package-name: megatron.core + # python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python + + install-test-summary: + needs: [pre-flight, pip-test-pytorch, uv-test-pytorch] + runs-on: ubuntu-latest + name: Install test summary + if: | + ( + needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + || always() + ) + && !cancelled() + && github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Get workflow result + id: result + shell: bash -x -e -u -o pipefail {0} + env: + GH_TOKEN: ${{ github.token }} + RUN_ID: ${{ github.run_id }} + SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }} + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 + + if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + # Show which jobs failed + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' + exit 1 + fi diff --git a/.github/workflows/multi-approval-bot.yml b/.github/workflows/multi-approval-bot.yml new file mode 100644 index 00000000000..63776ada338 --- /dev/null +++ b/.github/workflows/multi-approval-bot.yml @@ -0,0 +1,74 @@ +name: "Codeowners Approval Workflow" + +on: + push: + branches: + - "pull-request/[0-9]+" + merge_group: + types: [checks_requested] + +jobs: + pre-flight: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v1.0.0 + if: github.repository == 'NVIDIA/Megatron-LM' + + codeowners-approval: + needs: [pre-flight] + runs-on: ubuntu-latest + if: | + !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + steps: + - name: Get PR info + id: get-pr-info + if: startsWith(github.ref, 'refs/heads/pull-request/') + uses: nv-gha-runners/get-pr-info@main + + - name: Checkout action + uses: actions/checkout@v6 + with: + repository: noamelf/codeowner-multi-approval-action + ref: v0.1 + path: codeowner-multi-approval-action + + - name: Check Codeowners Approval + uses: ./codeowner-multi-approval-action + with: + pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + repo-name: ${{ github.repository }} + github-token: ${{ secrets.PAT }} + + multi-approval-bot-summary: + needs: [pre-flight, codeowners-approval] + if: | + ( + needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true' + || always() + ) + && github.repository == 
'NVIDIA/Megatron-LM' + && !cancelled() + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Result + env: + GH_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 + + if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + # Show which jobs failed + gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' + exit 1 + fi diff --git a/.github/workflows/nightly-sync-main-to-dev.yml b/.github/workflows/nightly-sync-main-to-dev.yml new file mode 100644 index 00000000000..d7c9e46811d --- /dev/null +++ b/.github/workflows/nightly-sync-main-to-dev.yml @@ -0,0 +1,217 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Nightly Sync Main to Dev + +on: + workflow_dispatch: + schedule: + # 21:00 UTC = 2 PM PDT (1 PM PST during winter — GitHub Actions cron + # is UTC-only and does not follow DST). + - cron: '0 21 * * *' + +concurrency: + group: nightly-sync-main-to-dev + cancel-in-progress: false + +permissions: + contents: write + pull-requests: write + issues: write + id-token: write + +jobs: + # Re-dispatch scheduled runs as workflow_dispatch via a PAT so the heavy + # job runs with a real User-type actor. On `schedule` events GitHub sets + # `github.actor` to `github-merge-queue` (no Users-API entry), which + # crashes anthropics/claude-code-action@v1 in `checkHumanActor` with a + # 404 before `allowed_bots` is ever consulted. Upstream fix PR + # https://github.com/anthropics/claude-code-action/pull/1212 is closed + # and unmerged; see issue + # https://github.com/anthropics/claude-code-action/issues/1284 for the + # same class of bug. The dispatch carries the PAT owner as the actor. 
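The redispatch trick described in the comment above is easiest to see against the Users API itself. As a hedged illustration (not part of this diff; `actor_is_user` is a hypothetical helper, and the action's real check also handles bot allowlists), this is the kind of lookup that fails for the `github-merge-queue` actor:

```python
# Sketch: why `schedule` runs crash checkHumanActor. App/queue actors such
# as "github-merge-queue" have no entry under /users/<login>, so the lookup
# 404s; a PAT-driven workflow_dispatch carries a real User actor instead.
import urllib.error
import urllib.request


def actor_is_user(actor: str) -> bool:
    req = urllib.request.Request(
        f"https://api.github.com/users/{actor}",
        headers={"Accept": "application/vnd.github+json"},
    )
    try:
        urllib.request.urlopen(req)
        return True
    except urllib.error.HTTPError as err:
        if err.code == 404:  # no Users-API entry, so not a human actor
            return False
        raise


# actor_is_user("github-merge-queue") -> False (404); the action raises at
# this point before its allowed_bots setting is ever consulted.
```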
+ cron-redispatch: + if: github.event_name == 'schedule' && github.repository == 'NVIDIA/Megatron-LM' + runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ secrets.PAT }} + steps: + - name: Dispatch sync workflow via PAT + run: | + gh workflow run nightly-sync-main-to-dev.yml \ + --repo "${{ github.repository }}" \ + --ref main + + sync-main-to-dev: + if: github.event_name == 'workflow_dispatch' && github.repository == 'NVIDIA/Megatron-LM' + runs-on: ubuntu-latest + timeout-minutes: 360 + env: + GH_TOKEN: ${{ secrets.PAT }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + token: ${{ secrets.PAT }} + + - name: Configure Git + run: | + git config user.name "svcnvidia-nemo-ci" + git config user.email "svcnvidia-nemo-ci@nvidia.com" + + - name: Compute branch name + id: vars + run: | + DATE=$(date -u +%d_%m_%Y) + BRANCH="main2dev/${DATE}" + echo "branch=$BRANCH" >> "$GITHUB_OUTPUT" + echo "date=$DATE" >> "$GITHUB_OUTPUT" + + - name: Close previous unmerged sync PRs + run: | + OPEN_PRS=$(gh pr list \ + --repo "${{ github.repository }}" \ + --base dev \ + --state open \ + --json number,headRefName \ + --jq '.[] | select(.headRefName | startswith("main2dev/")) | .number') + + for PR_NUM in $OPEN_PRS; do + echo "Closing stale sync PR #${PR_NUM}" + gh pr close "$PR_NUM" \ + --repo "${{ github.repository }}" \ + --comment "Superseded by today's nightly sync." + done + + - name: Check if sync is needed + id: check-sync + run: | + git fetch origin main dev + AHEAD_COUNT=$(git rev-list --count origin/dev..origin/main) + echo "main is $AHEAD_COUNT commit(s) ahead of dev" + if [ "$AHEAD_COUNT" -eq 0 ]; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "No changes to sync." + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + + - name: Run Claude Code to merge, fix, and iterate + if: steps.check-sync.outputs.skip != 'true' + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + github_token: ${{ secrets.PAT }} + prompt: | + You are an automated sync bot. Merge `main` into `dev`, create a + PR, ensure CI passes (fixing failures), and mark the PR ready. + There are 4 phases. You are NOT done until Phase 4 completes. + + REPO: ${{ github.repository }} + BRANCH: ${{ steps.vars.outputs.branch }} + DATE: ${{ steps.vars.outputs.date }} + + Read `.claude/skills/nightly-sync/SKILL.md` for the detailed + merge strategy, CI architecture, failure investigation procedures, + and known issues. Also read `.claude/skills/build-and-test/SKILL.md` + and `CLAUDE.md` for general CI and contribution guidelines. + + ## Hard Constraints + + **Exit condition:** You MUST run `gh pr ready <PR_NUMBER>` before + exiting. That command is Phase 4. Do NOT exit after Phase 1, 2, + or 3 — not even if CI is "still running" or "stuck in queue." + Keep polling until it resolves, then act. + + **NO background tasks. Ever.** + You are running inside a single GitHub Actions step. The step + process owns your shell. When you stop issuing tool calls, the + step ends and the runner container is DESTROYED — every + background process dies with it and cannot resume. There is no + "future session" to wake up into.
+ + The following are strictly forbidden: + - `Bash` with `run_in_background: true` + - `Agent` with `run_in_background: true` + - `ScheduleWakeup` (nothing will ever wake up) + - Any shell command ending in `&`, or using `nohup`, `disown`, + or `setsid` to detach a process + - `tail -f` on a log produced by a backgrounded task + + Required shape for every long wait: ONE foreground Bash tool + call containing an inline `while true; do ... sleep <N>; done` + or `until ...; do sleep <N>; done` loop that BLOCKS inside + that single tool call and only returns when the wait is + resolved (success, failure, or a clearly-classified terminal + state). Do NOT break a long wait into many short polls with + conversation in between — that wastes `--max-turns` and + creates windows where the agent could forget the loop. + + **Source of truth for CI status:** + `gh pr view <PR_NUMBER> --repo $REPO --json statusCheckRollup` + This lists every required check — GitHub Actions jobs AND + external contexts (GitLab CI, `copy-pr-bot`, etc.). The + `gh api .../actions/runs/<RUN_ID>/jobs` endpoint alone is + NOT sufficient — it misses external contexts. + + **Pre-existing failures:** MUST verify against recent dev CI + before classifying any failure as pre-existing. Run + `gh pr checks` on a recently merged dev PR. If the test passes + on dev, the failure is sync-caused and you must fix it. A + check that has never completed on your PR cannot be + pre-existing — wait for it to finish first. + + **Phase 4 gate — strict "all terminal, all green":** + Do NOT run `gh pr ready` until every non-exempt required check + in `statusCheckRollup` satisfies BOTH: + - `status == "COMPLETED"` (NOT `QUEUED`, `IN_PROGRESS`, + `PENDING`, `WAITING`, or `REQUESTED`), AND + - `conclusion` ∈ {`SUCCESS`, `SKIPPED`, `NEUTRAL`}. + A check stuck in a runner queue is NOT complete. Never + classify queued/in-progress jobs as "infrastructure-blocked" + and ship anyway — wait for them to reach a terminal + conclusion, then act on that result. When a check fails, + loop: diagnose → fix → commit → push → `/ok to test <SHA>` → + poll. Only exit the loop when the gate is satisfied on the + LATEST CI run against the current HEAD SHA. + + **Exempt checks (may be ignored for the Phase 4 gate):** + These categories are pre-merge policy signals, not + correctness signals, so their failure must not block the + sync bot from marking the PR ready for human review. + + - Approval / code-review: `codeowners-approval`, + `check-approval`, `multi-approval-bot-summary`, + `is-not-external-contributor`, any check whose name + contains `review` or `approval`. + - Code coverage: `Coverage (unit-test)`, `Coverage_Fake`, + any check whose name contains `codecov` or `coverage` + (case-insensitive). + - Docs: `build-docs / Build docs`, `build-docs-summary`, + any check whose name contains `build-docs`, `doc-build`, + `readthedocs`, or `sphinx`. + + Everything else — unit tests (`tests/unit_tests/...`), + integration tests (`gpt/...`, `moe/...`, etc.), `linting`, + `cicd-container-build`, `cicd-mbridge-testing`, + `Nemo_CICD_Test`, `copyright-check`, `pre-flight`, wheel + builds, etc. — is NOT exempt and must reach a terminal + green conclusion.
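As a hedged companion to the prose gate above (a sketch, not the bot's actual implementation; the helper name and the exemption regex are assumptions, and only the documented `statusCheckRollup` shapes are relied on: `status`/`conclusion` for GitHub check runs, `state` for external status contexts):

```python
# Sketch of the Phase 4 "all terminal, all green" gate over
# `gh pr view <PR> --json statusCheckRollup`.
import json
import re
import subprocess

EXEMPT = re.compile(
    r"approval|review|coverage|codecov|build-docs|doc-build|readthedocs|sphinx",
    re.IGNORECASE,
)
GREEN = {"SUCCESS", "SKIPPED", "NEUTRAL"}


def gate_satisfied(pr_number: int, repo: str) -> bool:
    out = subprocess.run(
        ["gh", "pr", "view", str(pr_number), "--repo", repo,
         "--json", "statusCheckRollup"],
        capture_output=True, text=True, check=True,
    ).stdout
    for check in json.loads(out)["statusCheckRollup"]:
        name = check.get("name") or check.get("context", "")
        if EXEMPT.search(name):
            continue  # policy signal, not a correctness signal
        if "state" in check:  # external StatusContext, e.g. GitLab CI
            if check["state"] != "SUCCESS":
                return False
        elif check.get("status") != "COMPLETED" or check.get("conclusion") not in GREEN:
            return False  # still queued/in-progress, or non-green conclusion
    return True


# The required foreground wait then reduces to one blocking loop:
#   while not gate_satisfied(pr, repo): time.sleep(300)
```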
+ show_full_output: true + claude_args: | + --allowedTools "Bash,Read,Edit,Write,Grep,Glob,Agent" + --model "opus[1m]" + --effort max + --max-turns 1500 diff --git a/.github/workflows/oncall-assign.yml b/.github/workflows/oncall-assign.yml new file mode 100644 index 00000000000..6da0776ffc2 --- /dev/null +++ b/.github/workflows/oncall-assign.yml @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Oncall Assign + +on: + pull_request_target: + types: [ready_for_review] + branches: + - main + +permissions: + pull-requests: write + contents: read + +jobs: + assign-reviewer: + runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft }} + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.10' + + - name: Install dependencies + run: pip install requests slack-sdk + + - name: Assign Reviewer + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }} diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml new file mode 100644 index 00000000000..0d5f774e441 --- /dev/null +++ b/.github/workflows/oncall-rotation.yml @@ -0,0 +1,60 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Oncall Rotation + +on: + schedule: + # Runs at 09:00 UTC every Wednesday + - cron: "0 9 * * 3" + workflow_dispatch: + +permissions: + contents: write + +jobs: + rotate-schedule: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + token: ${{ secrets.PAT }} + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Rotate Schedule + env: + # Token to read org team members. Needs read:org scope. 
+ GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }} + # Slack token for updating the Slack usergroup + SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }} + run: | + pip install --no-cache-dir "uv<0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache + uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate + + - name: Commit and Push changes + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git add .github/oncall_schedule.json + git commit -m "chore: rotate oncall schedule" || echo "No changes to commit" + git pull --rebase + git push origin HEAD:main diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml new file mode 100644 index 00000000000..6d619a8a1bc --- /dev/null +++ b/.github/workflows/release-docs.yml @@ -0,0 +1,115 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Release docs +on: + workflow_dispatch: + inputs: + dry-run: + description: Whether to run the workflow in dry-run mode + required: true + type: boolean + default: true + publish-as-latest: + description: Publish as Latest stable version. + required: false + type: boolean + default: true + docs-version-override: + description: Docs version if commit is not tagged + required: false + type: string + default: "" + update-version-picker: + description: Update version picker. + required: false + type: boolean + default: true + notify-emails: + description: Email addresses to send the notification to. Format as "me@me.com,you@you.com". + required: false + type: string + workflow_call: + inputs: + dry-run: + description: Whether to run the workflow in dry-run mode + required: true + type: boolean + default: true + publish-as-latest: + description: Publish as Latest stable version. + required: false + type: boolean + default: true + docs-version-override: + description: Docs version if commit is not tagged + required: false + type: string + default: "" + update-version-picker: + description: Update version picker. + required: false + type: boolean + default: true + notify-emails: + description: Email addresses to send the notification to. Format as "me@me.com,you@you.com". + required: false + type: string + build-docs-ref: + description: Reference to build the docs from + required: false + type: string + default: ${{ github.sha }} + +jobs: + build-docs: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0 + with: + ref: ${{ inputs.build-docs-ref }} + + publish-docs: + runs-on: ubuntu-latest + needs: [build-docs] + steps: + - uses: actions/checkout@v6 + with: + repository: NVIDIA-NeMo/FW-CI-templates + ref: v0.74.0 + path: FW-CI-templates + + - uses: ./FW-CI-templates/.github/actions/publish-docs + # This workflow runs either on main, or on a version tag. Any other git ref will lead + # to an error. + # If it's on main, it will publish to the "latest" directory in Akamai.
+# If it's on a versioned tag, it will extract the version number from the tag (strip `v` prefix) +# and publish to the versioned directory in Akamai. + with: + dry-run: ${{ inputs.dry-run }} + artifacts-name: docs-html + artifacts-path: _build/html + emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }} + overwrite-latest-on-tag: ${{ inputs.publish-as-latest }} + docs-version-override: ${{ inputs.docs-version-override }} + update-version-picker: ${{ inputs.update-version-picker }} + run-on-version-tag-only: ${{ github.ref_name != 'main' }} + request-name: megatron-core-publish-docs-${{ github.run_id }} + aws-region: ${{ vars.DOCS_AWS_REGION }} + aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + akamai-host: ${{ secrets.AKAMAI_HOST }} + akamai-client-token: ${{ secrets.AKAMAI_CLIENT_TOKEN }} + akamai-client-secret: ${{ secrets.AKAMAI_CLIENT_SECRET }} + akamai-access-token: ${{ secrets.AKAMAI_ACCESS_TOKEN }} + s3-target-root: ${{ secrets.S3_BUCKET_NAME }} + s3-target-path: megatron-core/developer-guide diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml new file mode 100644 index 00000000000..97def9dc2f7 --- /dev/null +++ b/.github/workflows/release-freeze.yml @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: "Code freeze" + +on: + workflow_dispatch: + inputs: + release-type: + type: choice + description: Type of release + options: + - major + - minor + freeze-commit: + type: string + description: Commit SHA to use for cut-off + required: false + default: main + dry-run: + type: boolean + description: Dry-run of code-freeze + required: false + default: true +jobs: + code-freeze: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.86.0 + with: + library-name: Megatron-Core + python-package: megatron.core + release-type: ${{ inputs.release-type }} + freeze-commit: ${{ inputs.freeze-commit }} + dry-run: ${{ inputs.dry-run }} + release-branch-prefix: core_ + use-pat: true + secrets: + SLACK_WEBHOOK: ${{ inputs.dry-run && secrets.SLACK_CI_CHANNEL_WEBHOOK || secrets.SLACK_MAIN_CHANNEL_WEBHOOK }} + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }} + PAT: ${{ secrets.PAT }} diff --git a/.github/workflows/release-nightly-docs.yml b/.github/workflows/release-nightly-docs.yml new file mode 100644 index 00000000000..89ceb1fbcd8 --- /dev/null +++ b/.github/workflows/release-nightly-docs.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Release Nightly Docs + +on: + schedule: + - cron: "0 10 * * *" + +jobs: + call-release-docs: + uses: ./.github/workflows/release-docs.yml + with: + dry-run: false + publish-as-latest: false + docs-version-override: "nightly" + update-version-picker: false + secrets: inherit diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 00000000000..7bf36ad71bc --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,162 @@ +# Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: "Build, validate, and release Megatron-Core" + +on: + push: + branches: + - main + - "pull-request/[0-9]+" + - "deploy-release/*" + merge_group: + types: [checks_requested] + workflow_dispatch: + inputs: + release-ref: + description: Ref (SHA or branch name) to release + required: true + type: string + dry-run: + description: Compute the release but do not publish wheel, GH release, or docs. 
+ required: true + default: true + type: boolean + create-gh-release: + description: Create a GitHub release + required: true + default: true + type: boolean + generate-changelog: + description: Generate changelog + required: false + default: true + type: boolean + publish-docs: + description: Publish docs + required: false + default: true + type: boolean + version-bump-branch: + description: Branch for version bump + required: true + type: string + gh-release-from-tag: + description: Tag of previous release for changelog builder + required: false + type: string + default: "" + +permissions: + id-token: write + contents: write + pull-requests: write + +defaults: + run: + shell: bash -x -e -u -o pipefail {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} + cancel-in-progress: ${{ github.event_name == 'push' }} + +jobs: + pre-flight: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.94.1 + if: github.repository == 'NVIDIA/Megatron-LM' && github.event_name != 'workflow_dispatch' + + bump: + needs: [pre-flight] + if: | + !cancelled() && !failure() + && github.repository == 'NVIDIA/Megatron-LM' + && !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_merge_group == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_bump.yml@v1.0.0 + with: + release-ref: ${{ inputs.release-ref || github.sha }} + validate-only: ${{ github.event_name != 'workflow_dispatch' }} + dry-run: ${{ inputs.dry-run || false }} + version-bump-branch: ${{ inputs.version-bump-branch || github.ref_name }} + restrict-to-admins: true + app-id: ${{ vars.BOT_ID }} + library-name: Megatron Core + bump-targets: | + [ + {"python-package": "megatron.core", "src-dir": ""}, + {"python-package": "megatron_fsdp", "src-dir": "megatron/core/distributed/fsdp/src/"} + ] + secrets: inherit # pragma: allowlist secret + + build-test-publish-wheels: + needs: [pre-flight, bump] + if: | + !cancelled() && !failure() && needs.bump.result == 'success' + && github.repository == 'NVIDIA/Megatron-LM' + && ( + github.event_name == 'workflow_dispatch' + || !(needs.pre-flight.outputs.docs_only == 'true' + || needs.pre-flight.outputs.is_deployment_workflow == 'true') + ) + uses: ./.github/workflows/_build_test_publish_wheel.yml + with: + ref: ${{ inputs.release-ref || github.sha }} + dry-run: ${{ inputs.dry-run || false }} + no-publish: ${{ github.event_name != 'workflow_dispatch' || inputs.dry-run }} + secrets: inherit # pragma: allowlist secret + + finalize: + needs: [bump, build-test-publish-wheels] + if: | + github.repository == 'NVIDIA/Megatron-LM' + && (success() || !failure()) + && !cancelled() + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_finalize.yml@v1.0.0 + with: + release-ref: ${{ inputs.release-ref || github.sha }} + release-version: ${{ needs.bump.outputs.release-version }} + library-name: Megatron Core + pypi-name: megatron-core + validate-only: ${{ github.event_name != 'workflow_dispatch' }} + dry-run: ${{ inputs.dry-run || false }} + create-gh-release: ${{ inputs.create-gh-release || true }} + gh-release-tag-prefix: core_ + gh-release-use-changelog-builder: ${{ inputs.generate-changelog || false }} + gh-release-from-tag: ${{ inputs.gh-release-from-tag || '' }} + publish-docs: ${{ inputs.publish-docs || true }} + docs-target-path: megatron-core/developer-guide + publish-as-latest: true + run-on-version-tag-only: 
${{ github.ref_name != 'main' }} + app-id: ${{ vars.BOT_ID }} + secrets: inherit # pragma: allowlist secret + + release-summary: + needs: [pre-flight, bump, build-test-publish-wheels, finalize] + if: github.repository == 'NVIDIA/Megatron-LM' && !cancelled() + runs-on: ubuntu-latest + steps: + - name: Result + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --repo ${{ github.repository }} --json jobs --jq '[.jobs[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "action_required")] | length') + + if [ "${FAILED_JOBS:-0}" -eq 0 ]; then + echo "✅ All previous jobs completed successfully" + exit 0 + else + echo "❌ Found $FAILED_JOBS failed job(s)" + gh run view $GITHUB_RUN_ID --repo ${{ github.repository }} --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "action_required") | .name' + exit 1 + fi diff --git a/.github/workflows/review-trigger.yml b/.github/workflows/review-trigger.yml new file mode 100644 index 00000000000..7375e605aff --- /dev/null +++ b/.github/workflows/review-trigger.yml @@ -0,0 +1,28 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Lightweight workflow that triggers on review approval; otherwise there is no access to the right secret. +# No secrets needed — it just signals auto-swap-labels.yml via workflow_run. + +name: Review Trigger + +on: + pull_request_review: + types: [submitted] + +jobs: + signal: + runs-on: ubuntu-latest + if: >- + github.event.review.state == 'approved' && + github.event.pull_request.base.ref == 'main' && + github.repository == 'NVIDIA/Megatron-LM' + steps: + - name: Save PR number + run: | + mkdir -p pr + echo "${{ github.event.pull_request.number }}" > pr/number + - name: Upload PR number + uses: actions/upload-artifact@v6 + with: + name: pr-number + path: pr/ diff --git a/.github/workflows/sync-skills.yml b/.github/workflows/sync-skills.yml new file mode 100644 index 00000000000..75b8c20dca0 --- /dev/null +++ b/.github/workflows/sync-skills.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Sync skills → agent dirs + +on: + workflow_dispatch: + push: + branches: + - main + paths: + - "skills/**" + - "AGENTS.md" + +jobs: + sync: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_sync_skills.yml@v0.91.0 + secrets: + PAT: ${{ secrets.PAT }} diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml new file mode 100644 index 00000000000..7f32ac55c57 --- /dev/null +++ b/.github/workflows/sync-team-usergroups.yml @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Sync GitHub Teams to Slack User Groups + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +jobs: + sync-usergroups: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Sync Teams to User Groups + env: + GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }} + SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }} + run: | + pip install --no-cache-dir "uv<0.9.29" + uv venv .venv + uv cache clean + uv sync --no-cache + uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py diff --git a/.github/workflows/trigger-mbridge-tests.yml b/.github/workflows/trigger-mbridge-tests.yml new file mode 100644 index 00000000000..023851e966a --- /dev/null +++ b/.github/workflows/trigger-mbridge-tests.yml @@ -0,0 +1,42 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Trigger MBridge Tests +on: + workflow_dispatch: + inputs: + mbridge_ref: + description: "MBridge branch/ref to trigger" + required: false + type: string + default: "main" + test_suite: + description: "Test suite to run" + required: false + type: choice + options: + - "all" + - "unit-only" + - "functional-only" + default: "all" + +jobs: + trigger-mbridge-tests: + runs-on: ubuntu-latest + steps: + - name: Trigger MBridge tests + uses: convictional/trigger-workflow-and-wait@v1.6.5 + with: + owner: NVIDIA-NeMo + repo: Megatron-Bridge + workflow_file_name: cicd-main.yml + github_token: ${{ secrets.PAT }} + ref: ${{ inputs.mbridge_ref }} + wait_interval: 60 + propagate_failure: true + client_payload: | + { + "mcore_ref": "${{ github.sha }}", + "test_suite": "${{ inputs.test_suite }}", + "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } diff --git a/.gitignore b/.gitignore index cac3499524d..5556d1d5a4a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,20 @@ build *~ slurm* logs +.vscode +local/ +.gitmodules +wandb/ +onelogger.log +onelogger.err +.venv +runs/ +/test_cases/ +**/dist/ + +# Sphinx documentation +docs/_build +docs/apidocs + +# Git worktrees +.worktrees/ \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3cd1c2f2e69..2eb1b43be0c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,299 +1,284 @@ -image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov +.merge_train_rule: &merge_train_rule + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + INTEGRATION_TEST_SCOPE: mr + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr-slim + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_TIME_LIMIT: 2700 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -stages: - - test - - cleanup +workflow: + rules: + # Do not trigger for forks + - if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm") + when: never -variables: &VARS - 
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" - DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data" - PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov - PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate - TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: L0 # Can specify levels - TESTS_TO_RUN_AFTER_MERGING: L0 # Can specify levels - TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests - TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/ - DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') -unit_tests: - tags: - - docker_local_runner - stage: test - script: - - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests - coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' - artifacts: - paths: - - coverage - expire_in: 30 days - only: - - merge_requests + # ci-branches only for schedule + - if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule" + when: never -.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher - tags: - - ssh_selene_runner - stage: test - script: &selene-test-resume-launcher-script - - echo "Running selene resume from checkpoint test. " - - pwd - - export BUILD_DIR=`pwd` - - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes - - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." - - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS - - export DATA_DIR=$DATA_DIR - - echo "Run name is $RUN_NAME" - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME - - export LOGS_DIR=$BASE_DIR/logs - - export RESULTS_DIR=$BASE_DIR/results - - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints - - echo "Submitting job" - - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES` - - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }'); - - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID - - \[ ! 
-z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n" - "----------WAITING FOR SLURM JOB TO BEGIN-----------\n" - "---------------------------------------------------\n" - "$(scontrol show job=${SLURM_JOBID})\n" - "---------------------------------------------------\n" - # Gitlab logs collapsible section markers - - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K" - # Follow output of the job - - echo "Finished job" - - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1) - - echo "Slurm job state $SLURM_STATE" - - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi - - source $PYTHON_VIRTUAL_ENV - - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." - - echo "Completed the job" - rules: - - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT - when: always - - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING' - when: always - - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED - when: always - allow_failure: false + # For scheduled pipelines + - if: $CI_PIPELINE_SOURCE == "schedule" + auto_cancel: + on_new_commit: none -.selene_test_launcher: &selene-test-launcher - tags: - - ssh_selene_runner - stage: test - script: &selene-test-launcher-script - - echo "Running selene test" - - echo "$CI_MERGE_REQUEST_APPROVED" - - pwd - - export BUILD_DIR=`pwd` - - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps - - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi - - export $RUN_NAME - - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." - - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE - - export MBS GBS - - export DATA_DIR=$DATA_DIR - - echo "Run name is $RUN_NAME" - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME - - export LOGS_DIR=$BASE_DIR/logs - - export RESULTS_DIR=$BASE_DIR/results - - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints - - echo "Submitting job" - - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS` - - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }'); - - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID - - \[ !
-z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n" - "----------WAITING FOR SLURM JOB TO BEGIN-----------\n" - "---------------------------------------------------\n" - "$(scontrol show job=${SLURM_JOBID})\n" - "---------------------------------------------------\n" - # Gitlab logs collapsible section markers - - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K" - # Follow output of the job - - echo "Finished job" - - echo "Slurm log dump start ------------------------------------------------------------" - - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - echo "Slurm log dump end --------------------------------------------------------------" - - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID - - if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi - - source $PYTHON_VIRTUAL_ENV - - | - if [[ "$DISPLAY_OUTPUT" == "True" ]]; then - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME - fi - - | - if [[ $USE_TE -ne 1 ]]; then - echo "Checking against ground truth file" - export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." - fi - - echo "Completed the job" - rules: - - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT - when: always - - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING' - when: always - - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED - when: always - allow_failure: false + # For manual pipelines (GitLab UI) + - if: $CI_PIPELINE_SOURCE == "web" -train.te_gpt3.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 1 - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "50:00" - TEST_LEVEL: L0 + # For pipelines created via the REST API (personal access token) + - if: $CI_PIPELINE_SOURCE == "api" -train.gpt3.345m_tp4_pp1_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 4 - PP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For trigger pipelines + - if: $CI_PIPELINE_SOURCE == "trigger" -train.gpt3.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For push to main + - if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/) + variables: + UNIT_TEST: "no" + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_TIME_LIMIT: 3600 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" + auto_cancel: + on_new_commit: interruptible -train.gpt3.345m_tp1_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - 
TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For merge-trains that need to be fast-tracked + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/ + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "no" + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -train.gpt3.345m_tp1_pp4_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 1 - PP_SIZE: 4 - VP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For normal merge-trains + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' + variables: *merge_train_rule -resume.checkpoint.gpt3.345m_tp1_pp2_1node: - <<: *selene-test-resume-checkpoint-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - TIME_LIMIT: "30:00" - TEST_LEVEL: L0 + # For MRs with integration suite + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/ + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "yes" + INTEGRATION_TEST_SCOPE: mr + FUNCTIONAL_TEST: "no" + FUNCTIONAL_TEST_SCOPE: mr-slim + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_TIME_LIMIT: 2700 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -train.bert.345m_tp4_pp1_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 4 - PP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For MRs with nightly + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_TIME_LIMIT: 2700 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -train.bert.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For MRs with weekly + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no" + FUNCTIONAL_TEST_TIME_LIMIT: 9000 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -train.bert.345m_tp1_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + # For MRs with heavy suite + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/ + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_TIME_LIMIT: 2700 + CLUSTER_A100: "" + CLUSTER_H100: "" + PUBLISH: "no" -train.bert.345m_tp1_pp4_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 4 - VP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: 
"20:00" - TEST_LEVEL: L0 + # Default MRs + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' + variables: + UNIT_TEST: "yes" + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 30 + INTEGRATION_TEST: "no" + FUNCTIONAL_TEST: "no" + PUBLISH: "no" -resume.checkpoint.bert.345m_tp1_pp2_1node: - <<: *selene-test-resume-checkpoint-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - TIME_LIMIT: "30:00" - TEST_LEVEL: L0 + - when: never -cleanup.selene: - tags: - - ssh_selene_runner - stage: cleanup - variables: - <<: [*VARS] - script: - - set +e - - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l` - - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf - - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene" - allow_failure: true - rules: - - when: always + auto_cancel: + on_new_commit: interruptible + +stages: + - build + - test + - integration_tests + - functional_tests + - publish + +default: + interruptible: true + retry: + max: 2 + when: runner_system_failure + +variables: + BUILD: + value: "yes" + UNIT_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the funtional test suite + UNIT_TEST_REPEAT: + value: "1" + description: "Number of repetitions" + UNIT_TEST_TIMEOUT: + value: "30" + description: Timeout (minutes) for Unit tests (all repeats) + INTEGRATION_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the integration test suite + INTEGRATION_TEST_SCOPE: + value: "mr" + options: + - "mr" + - "nightly" + - "weekly" + - "pre-release" + - "release" + description: "Testsuite to run (only for INTEGRATION_TEST=yes)" + INTEGRATION_TEST_TIME_LIMIT: + value: "900" + description: "Timeout in seconds per test" + INTEGRATION_TEST_CASES: + value: "all" + description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." + FUNCTIONAL_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the funtional test suite + FUNCTIONAL_TEST_SCOPE: + value: "mr" + options: + - "mr" + - "nightly" + - "weekly" + - "pre-release" + - "release" + description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" + FUNCTIONAL_TEST_REPEAT: + value: "5" + description: "Number of repetitions per test" + FUNCTIONAL_TEST_TIME_LIMIT: + value: "2700" + description: "Timeout in seconds per test" + FUNCTIONAL_TEST_CASES: + value: "all" + description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." 
+ FUNCTIONAL_TEST_NAME: + description: "Name of functional test run (only for pre-release and release)" + value: "$$CI_COMMIT_SHA" + FUNCTIONAL_TEST_RECORD_CHECKPOINTS: + value: "no" + description: "Record golden checkpoints" + options: + - "yes" + - "no" + CLUSTER_A100: + value: "dgxa100_dracooci" + options: + - "dgxa100_dracooci" + - "dgxa100_dracooci-ord" + description: "Cluster for A100 workloads" + CLUSTER_H100: + value: "dgxh100_coreweave" + options: + - "dgxh100_coreweave" + - "dgxh100_eos" + description: "Cluster for H100 workloads" + CLUSTER_GB200: + value: "dgxgb200_oci-hsg" + options: + - "dgxgb200_oci-hsg" + description: "Cluster for GB200 workloads" + PUBLISH: + value: "no" + options: + - "yes" + - "no" + description: Build and publish a wheel to PyPI + PUBLISH_COMMIT: + value: "$$CI_COMMIT_SHA" + description: Which commit to publish + PUBLISH_VERSION_BUMP_BRANCH: + value: "$$CI_COMMIT_BRANCH" + description: Which branch to target for version bump + PUBLISH_SCOPE: + value: "code-freeze" + options: + - "code-freeze" + - "release" + - "review-reminder" + - "upgrade-dependencies" + description: Type of publish (freeze or final release) + + # CI-wide variables + CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts + CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev + CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci + UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility + TE_GIT_REF: "" + +include: + - .gitlab/stages/00.pre.yml + - .gitlab/stages/01.build.yml + - .gitlab/stages/02.test.yml + - .gitlab/stages/03.integration-tests.yml + - .gitlab/stages/04.functional-tests.yml + - .gitlab/stages/05.publish.yml diff --git a/.gitlab/labeler-config.yml b/.gitlab/labeler-config.yml new file mode 100644 index 00000000000..2c37345c0e6 --- /dev/null +++ b/.gitlab/labeler-config.yml @@ -0,0 +1,33 @@ +CI: + - .gitlab-ci.yml + - Dockerfile.ci.lts + - Dockerfile.ci.dev + - .github/** + - .gitlab/** + +Datasets: + - megatron/core/datasets/** + +BERT: + - megatron/core/models/bert/** + +GPT: + - megatron/core/models/gpt/** + +Dist-Ckpt: + - megatron/core/dist_checkpointing + +Dist-Opt: + - megatron/core/optimizer/distrib_optimizer + +Inference: + - megatron/core/inference + +MoE: + - megatron/core/transformer/moe + +Tests: + - tests/** + +ParallelState: + - megatron/core/parallel_state.py diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh new file mode 100644 index 00000000000..0f34b838384 --- /dev/null +++ b/.gitlab/scripts/build.sh @@ -0,0 +1,67 @@ +#! /bin/bash + +set -x +env +eval "IMAGE=\$$IMAGE" + +# Start a named container in detached mode +docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity' +docker cp tests/.
download_test_data:/workdir/tests +docker exec download_test_data bash -c ' + ls -al /workdir/ + pip install --no-cache-dir click requests + python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets +' +docker cp download_test_data:/workdir/assets ./ +docker rm -f download_test_data + +docker context create tls-environment +docker buildx create --name container --driver=docker-container --use tls-environment + +ADDITIONAL_PARAMS=() + +CI_COMMIT_BRANCH="${CI_COMMIT_BRANCH:-$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" + +if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then + ADDITIONAL_PARAMS+=("--pull") +fi + +CI_COMMIT_BRANCH=$(echo "$CI_COMMIT_BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9._-]/-/g') +ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM},mode=max") +ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_COMMIT_BRANCH}-${PLATFORM}") +ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:main-${PLATFORM}") +ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:dev-${PLATFORM}") + +ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}-${PLATFORM}") + +if [[ -n "$CI_MERGE_REQUEST_IID" ]]; then + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM},mode=max") + ADDITIONAL_PARAMS+=("--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID}-${PLATFORM}") + ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}-${PLATFORM}") +fi + +if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly-${PLATFORM}") +fi + +if [[ -n "$TE_GIT_REF" ]]; then + ADDITIONAL_PARAMS+=("--build-arg TE_COMMIT=${TE_GIT_REF}") +fi + +echo $(git rev-parse HEAD) + +JET_API_VERSION=$(curl -s -u "$ARTIFACTORY_USER:$ARTIFACTORY_TOKEN" "https://sc-hw-artf.nvidia.com/artifactory/api/pypi/hw-joc-pypi/simple/jet-api/" | grep -o 'href="../../jet-api/[0-9.]*/' | sed 's|href="../../jet-api/||;s|/||' | sort -V -r | head -n1) + +DOCKER_BUILDKIT=1 docker build \ + --secret id=JET_INDEX_URLS \ + --secret id=LOGGER_INDEX_URL \ + --target $STAGE \ + -f docker/$FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID}-${PLATFORM} \ + --builder=container \ + --build-arg JET_API_VERSION=$JET_API_VERSION \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --provenance=false \ + --push \ + --progress plain \ + ${ADDITIONAL_PARAMS[@]} . diff --git a/.gitlab/scripts/check_imports.py b/.gitlab/scripts/check_imports.py new file mode 100644 index 00000000000..f46987d8d87 --- /dev/null +++ b/.gitlab/scripts/check_imports.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Import checker script for the megatron.core package.
+ +This script recursively discovers all Python modules in the specified package +and attempts to import them, reporting any import errors. +""" + +import importlib +import os +import sys +import traceback +from typing import Dict, List, Tuple + +import click + + +class ImportChecker: + """Check imports for all modules in a package.""" + + def __init__(self, package_name: str = "megatron.core", verbose: bool = False): + self.package_name = package_name + self.success_count = 0 + self.failure_count = 0 + self.graceful_count = 0 + self.skipped_count = 0 + self.failures: Dict[str, str] = {} + self.successes: List[str] = [] + self.graceful_failures: Dict[str, str] = {} + self.skipped: List[str] = [] + + # Modules to skip (known problematic ones) + self.skip_patterns = { + "__pycache__", + ".pytest_cache", + ".git", + "test_", + "_test", + } + + # Add current directory to Python path if not already there + current_dir = os.getcwd() + if current_dir not in sys.path: + sys.path.insert(0, current_dir) + + def should_skip_module(self, module_name: str) -> bool: + """Check if a module should be skipped.""" + for pattern in self.skip_patterns: + if pattern in module_name: + return True + return False + + def discover_modules(self, package_path: str) -> List[str]: + """Discover all Python modules in the given package path.""" + modules = [] + + package = importlib.import_module(package_path) + package_path = package.__path__[0] + + # Walk through all Python files + for root, dirs, files in os.walk(package.__path__[0]): + # Skip hidden directories and __pycache__ + dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__"] + + for file in files: + if file.endswith(".py") and not file.startswith("."): + # Convert file path to module name + rel_path = os.path.relpath(os.path.join(root, file), package_path) + module_parts = rel_path.replace(os.sep, ".").replace(".py", "") + + # Handle __init__.py files + if module_parts.endswith(".__init__"): + module_parts = module_parts[:-9] # Remove .__init__ + + full_module_name = ( + f"{self.package_name}.{module_parts}" + if module_parts + else self.package_name + ) + + if not self.should_skip_module(full_module_name): + modules.append(full_module_name) + + # Remove duplicates and sort + modules = sorted(list(set(modules))) + + return modules + + def import_module(self, module_name: str) -> Tuple[str, str]: + """ + Try to import a module and return success status and error message. 
+ + Returns: + Tuple of (status: str, error_message: str) + status can be: "success", "graceful", or "failed" + """ + try: + if module_name in sys.modules: + del sys.modules[module_name] + + importlib.import_module(module_name) + return "success", "" + + except Exception: + tb = traceback.format_exc() + if "UnavailableError" in tb: + return "graceful", "UnavailableError detected during import" + return "failed", tb + + def check_all_imports(self): + """Check imports for all discovered modules.""" + print(f"Discovering modules in package '{self.package_name}'...") + modules = self.discover_modules(self.package_name) + + if not modules: + print("No modules found!") + return + + print(f"Found {len(modules)} modules to check") + print("=" * 60) + + for module_name in modules: + status, error_msg = self.import_module(module_name) + + if status == "success": + self.success_count += 1 + self.successes.append(module_name) + elif status == "graceful": + self.graceful_count += 1 + self.graceful_failures[module_name] = error_msg + else: # failed + self.failure_count += 1 + self.failures[module_name] = error_msg + + # Print a summary of the import check results. + total = ( + self.success_count + + self.failure_count + + self.graceful_count + + self.skipped_count + ) + + print("\n" + "=" * 60) + print("IMPORT CHECK SUMMARY") + print("=" * 60) + print(f"Total modules checked: {total}") + print( + f"Successful imports: {self.success_count} ({self.success_count / total * 100:.1f}%)" + ) + print( + f"Gracefully handled: {self.graceful_count} ({self.graceful_count / total * 100:.1f}%)" + ) + print( + f"Failed imports: {self.failure_count} ({self.failure_count / total * 100:.1f}%)" + ) + if self.skipped_count > 0: + print( + f"Skipped modules: {self.skipped_count} ({self.skipped_count / total * 100:.1f}%)" + ) + + if self.graceful_failures: + print(f"\n🟡 GRACEFULLY HANDLED ({len(self.graceful_failures)}):") + print("-" * 40) + + if self.failures: + print(f"\n❌ FAILED IMPORTS ({len(self.failures)}):") + print("-" * 40) + for module_name, error_msg in self.failures.items(): + print(f"\n• {module_name}") + # Print each non-empty line of the traceback + error_lines = error_msg.split("\n") + for line in error_lines: + if line.strip(): + print(f" {line}") + + return self.failure_count == 0 + + +@click.command() +@click.option( + "--package-name", + required=True, + help="Package name to check imports for", +) +def main(package_name: str): + """Main entry point.""" + checker = ImportChecker(package_name=package_name) + successful = checker.check_all_imports() + sys.exit(0 if successful else 1) + + +if __name__ == "__main__": + main() diff --git a/.gitlab/scripts/fetch-legacy-suite.sh b/.gitlab/scripts/fetch-legacy-suite.sh new file mode 100644 index 00000000000..775a0c0ddd3 --- /dev/null +++ b/.gitlab/scripts/fetch-legacy-suite.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -euxo pipefail + +# Default values +MCORE_REPO="https://github.com/nvidia/megatron-lm.git" +MCORE_MR_COMMIT="main" +MCORE_BACKWARDS_COMMIT="" + +# Parse command line arguments +usage() { + cat < labels + - gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true + - cat labels + after_script: + - | + source labels + curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode 
"add_labels=$LABELS" -X PUT + +pre:maybe_cherry_pick_to_main: + rules: + - if: "$CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' && $CI_MERGE_REQUEST_LABELS =~ /mirror-to-main/" + - when: never + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + stage: .pre + image: nentangso/alpine-git-curl-jq + variables: + GIT_STRATEGY: "clone" + script: + - | + set -x + MR_ID=$CI_MERGE_REQUEST_IID + TARGET_BRANCH="cp/$MR_ID-into-main" + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$TARGET_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "true" ]]; then + echo Target branch already exists, will not cherry-pick again. + exit 0 + fi + + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') + + git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_PATH.git" + git remote add mr-origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH.git" + + git config --global user.email "mcore-bot@nvidia.com" + git config --global user.name "Mcore Bot" + + git fetch origin dev + git fetch mr-origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME + git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME + START_COMMIT=$(git merge-base origin/dev mr-origin/$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME) + END_COMMIT=$(git rev-parse HEAD) + + git fetch origin main + git checkout main + git checkout -b $TARGET_BRANCH + + git cherry-pick $START_COMMIT..$END_COMMIT + git push -u origin $TARGET_BRANCH + + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=$TARGET_BRANCH" \ + -d "target_branch=main" \ + -d "title=cp MR !$MR_ID from dev: \`$TITLE\`" \ + -d "labels=cherry-picked-from-dev" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry-picked \`$TITLE (!$MR_ID)\` into \`main\` for you! 🚀

Please review and approve this cherry-pick at your convenience\!" + +pre:maybe_cherry_pick_commit: + rules: + - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"' + - when: never + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + stage: .pre + image: nentangso/alpine-git-curl-jq + variables: + GIT_STRATEGY: "clone" + script: + - set -x + - set +e + - SHA=$(git rev-list --no-merges -n 1 HEAD) + - MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) + - MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' ) + - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" + - git config --global user.email "mcore-bot@nvidia.com" + - git config --global user.name "Mcore Bot" + - | + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') + TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') + + if [[ $TARGET_BRANCHES == "" ]]; then + echo Nothing to cherry-pick + exit 0 + fi + + echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then + echo Release branch does not yet exist, will not cherry-pick + continue + fi + + ( + git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH + git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git cherry-pick $SHA + git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH + git checkout main + ) + + CHERRYPICK_SUCCESSFUL=$? + + if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ + -d "target_branch=$RELEASE_BRANCH" \ + -d "title=Cherry-pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ + -d "labels=cherry-pick" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry-picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry-pick at your convenience\!" + + else + URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "beep boop 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK} + + fi + + done + interruptible: false + +pre:check_milestone: + extends: [.pre_rules] + image: badouralix/curl-jq + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - | + MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone') + - | + if [[ "$MILESTONE" == "null" ]]; then + LATEST_MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/milestones?state=active&order_by=due_date&sort=desc" | jq '.[0].id') + curl --request PUT --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data "milestone_id=${LATEST_MILESTONE}" + echo "Applied latest milestone (ID: ${LATEST_MILESTONE}) to this MR" + fi + +pre:check_status_of_main: + extends: [.pre_rules] + image: python:3.10 + timeout: 7 days + variables: + KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi + KUBERNETES_SERVICE_CPU_REQUEST: 8 + KUBERNETES_SERVICE_CPU_LIMIT: 12 + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - pip install --no-cache-dir python-gitlab click + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" + rules: + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/ + when: never + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' + when: always + - when: never diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml new file mode 100644 index 00000000000..aef3d64b014 --- /dev/null +++ b/.gitlab/stages/01.build.yml @@ -0,0 +1,148 @@ +.build_rules: + rules: + - if: $BUILD == "no" + when: never + - when: on_success + stage: test + +.build_image: + extends: [.build_rules, .dind_rules] + stage: build + tags: + - arch/${PLATFORM} + - origin/jet-fleet + - env/prod + - purpose/builder-large + services: + - name: docker:24.0.5-dind + variables: + HEALTHCHECK_TCP_PORT: "2376" + timeout: 180m + variables: + DOCKER_HOST: tcp://docker:2376 + DOCKER_TLS_CERTDIR: "/certs" + DOCKER_TLS_VERIFY: 1 + DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client" + STAGE: jet + MCORE_BACKWARDS_REF: core_r0.14.0 + KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi + SHARED_PATH: /builds/$CI_PROJECT_PATH/shared + script: + - eval PUBLISH_COMMIT=$PUBLISH_COMMIT + - apk add bash curl git + - export TE_GIT_REF=$TE_GIT_REF + - export GH_TOKEN=$GH_TOKEN + - bash .gitlab/scripts/build.sh + + - git fetch origin $MCORE_BACKWARDS_REF + - MCORE_BACKWARDS_COMMIT=$(git rev-parse FETCH_HEAD) + + - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env + - echo 
"MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env + - cat build.env + retry: + max: 2 + artifacts: + reports: + dotenv: build.env + +test:pre_build_image: + extends: [.build_image] + parallel: + matrix: + - IMAGE: CI_MCORE_LTS_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: lts + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3 + PLATFORM: amd64 + - IMAGE: CI_MCORE_LTS_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: lts + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3 + PLATFORM: arm64 + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:26.04-py3 + PLATFORM: amd64 + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + IMAGE_TYPE: dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:26.04-py3 + PLATFORM: arm64 + - IMAGE: UTILITY_IMAGE + FILE: Dockerfile.linting + BASE_IMAGE: python:3.10 + PLATFORM: amd64 + - IMAGE: UTILITY_IMAGE + FILE: Dockerfile.linting + BASE_IMAGE: python:3.10 + PLATFORM: arm64 + +test:build_nemo_image: + extends: [.build_image] + variables: + IMAGE: CI_NEMO_IMAGE + FILE: Dockerfile.ci.nemo + BASE_IMAGE: nvcr.io/nvidian/nemo:nightly + PLATFORM: amd64 + rules: + - if: $FUNCTIONAL_TEST == "yes" || $INTEGRATION_TEST == "yes" || $CI_COMMIT_BRANCH == "ci-rebuild-mcore-nemo-image" + when: on_success + +test:build_image: + needs: [test:pre_build_image] + extends: [.build_rules, .dind_rules] + parallel: + matrix: + - IMAGE: CI_MCORE_LTS_IMAGE + - IMAGE: CI_MCORE_DEV_IMAGE + - IMAGE: UTILITY_IMAGE + stage: build + tags: + - arch/amd64 + - origin/jet-fleet + - env/prod + - purpose/builder-large + services: + - name: docker:24.0.5-dind + variables: + HEALTHCHECK_TCP_PORT: "2376" + timeout: 180m + variables: + DOCKER_HOST: tcp://docker:2376 + DOCKER_TLS_CERTDIR: "/certs" + DOCKER_TLS_VERIFY: 1 + DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client" + STAGE: jet + MCORE_BACKWARDS_REF: core_r0.14.0 + KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi + SHARED_PATH: /builds/$CI_PROJECT_PATH/shared + script: + - apk add skopeo + - | + set -x + + env + eval "IMAGE=\$$IMAGE" + + docker manifest create ${IMAGE}:${CI_PIPELINE_ID} \ + ${IMAGE}:${CI_PIPELINE_ID}-amd64 \ + ${IMAGE}:${CI_PIPELINE_ID}-arm64 + + docker manifest push ${IMAGE}:${CI_PIPELINE_ID} + + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then + skopeo copy --all docker://${IMAGE}:${CI_PIPELINE_ID} docker://${IMAGE}:${CI_COMMIT_BRANCH} + fi + + - echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env + - echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env + - cat build.env + retry: + max: 2 + artifacts: + reports: + dotenv: build.env diff --git a/.gitlab/stages/02.test.yml b/.gitlab/stages/02.test.yml new file mode 100644 index 00000000000..a324ce037fb --- /dev/null +++ b/.gitlab/stages/02.test.yml @@ -0,0 +1,318 @@ +.test_rules: + rules: + - if: $PUBLISH == "yes" + when: never + - if: $BUILD == "no" + when: never + - when: on_success + stage: test + +include: + - template: Security/Secret-Detection.gitlab-ci.yml + +wait_for_resources: + extends: [.test_rules] + needs: + - job: test:linting_secret_detection + optional: true + - test:build_image + image: python:3.10 + timeout: 7 days + variables: + KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi + KUBERNETES_SERVICE_CPU_REQUEST: 8 + KUBERNETES_SERVICE_CPU_LIMIT: 12 + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - 
team/megatron + script: + - env + - pip install --no-cache-dir python-gitlab click + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export NUM_CONCURRENT_JOBS + - python tests/test_utils/python_scripts/wait_for_resources.py --pipeline-id $CI_PIPELINE_ID --target-branch $CI_MERGE_REQUEST_TARGET_BRANCH_NAME + rules: + - if: $CI_MERGE_REQUEST_LABELS =~ /fast-track/ + when: never + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + when: on_success + - when: never + +test:unit_tests_configure: + extends: [.test_rules] + needs: + - test:build_image + - job: wait_for_resources + optional: true + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + before_script: + - git rm -r tests/test_utils/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes + - ls tests/test_utils/local_recipes + script: + - env + - set -x + - | + A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + - | + ARGS=( + "--scope unit-tests" + "--n-repeat ${UNIT_TEST_REPEAT}" + "--time-limit $(( UNIT_TEST_TIMEOUT * 60 ))" + "--test-cases all" + "--cluster $H100_CLUSTER" + "--platform dgx_h100" + "--partition batch" + "--container-image ${UTILITY_IMAGE}" + "--container-tag ${CI_PIPELINE_ID}" + "--dependent-job test:unit_tests_configure" + "--slurm-account ${CI_SLURM_ACCOUNT}" + "--no-enable-warmup" + ) + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment "lts" \ + --tag "legacy" \ + --output-path "unit-test-job-lts-legacy.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment "lts" \ + --tag "latest" \ + --output-path "unit-test-job-lts-latest.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment "dev" \ + --tag "legacy" \ + --output-path "unit-test-job-dev-legacy.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment "dev" \ + --tag "latest" \ + --output-path "unit-test-job-dev-latest.yaml" + rules: + - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: on_success + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' + when: on_success + artifacts: + paths: + - unit-test-job-dev-legacy.yaml + - unit-test-job-dev-latest.yaml + - unit-test-job-lts-legacy.yaml + - unit-test-job-lts-latest.yaml + - tests/test_utils/local_recipes + +.unit_tests_run: + needs: + - job: test:linting_formatting + optional: true + - job: test:linting_copyright + optional: true + - job: test:linting_secret_detection + optional: true + - test:unit_tests_configure + - test:build_image + extends: [.test_rules] + trigger: + include: + - artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml + job: test:unit_tests_configure + strategy: depend + variables: + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID 
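+ # Note (descriptive comment): the two commit variables below are forwarded from the build.env dotenv artifact that the build jobs publish, so the triggered child pipeline is pinned to the same commits.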
+ MCORE_MR_COMMIT: $MCORE_MR_COMMIT + MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT + + inherit: + variables: true + rules: + - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: on_success + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' + when: on_success + +test:unit_tests_pyt(DEV)_mcore(latest): + extends: [.unit_tests_run] + variables: + ENVIRONMENT: dev + TAG: latest + +test:unit_tests_pyt(LTS)_mcore(latest): + extends: [.unit_tests_run] + variables: + ENVIRONMENT: lts + TAG: latest + +test:unit_tests_notify: + extends: [.test_rules] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + needs: + - test:unit_tests_pyt(DEV)_mcore(latest) + - test:unit_tests_pyt(LTS)_mcore(latest) + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - | + if [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} + else + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + fi + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0") + - export TEAM_SLUG=$SLACK_ADMIN + - | + python tests/test_utils/python_scripts/notify.py \ + --pipeline-id "${CI_PIPELINE_ID}" \ + --check-for unit-tests \ + --pipeline-context "unit-tests-extended" \ + --pipeline-created-at "${CI_PIPELINE_CREATED_AT}" + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == "ci-unit-test-extended" || $CI_COMMIT_BRANCH == "ci-dev-unit-test-extended") + when: always + - when: never + +# Override from template +secret_detection: + rules: + - when: never + +# Inherit and modify template +test:linting_secret_detection: + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + extends: [".secret-analyzer"] + needs: [test:build_image] + variables: + GIT_DEPTH: 0 + SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA} + allow_failure: false + rules: + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - when: never + script: + - apk add jq + - /analyzer run + - | + if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then + echo "At least one vulnerability has been found" + cat gl-secret-detection-report.json | jq '.' 
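+ # allow_failure is false for this job, so the non-zero exit below hard-blocks the merge request until the reported finding is triaged.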
+ exit 1 + fi + +test:unit_tests_x_coverage_report: + extends: [.test_rules] + needs: + - job: test:unit_tests_pyt(DEV)_mcore(latest) + - job: test:unit_tests_pyt(LTS)_mcore(latest) + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - python tests/test_utils/python_scripts/download_coverage_results.py --pipeline-id ${CI_PIPELINE_ID} + - coverage combine --keep $(ls coverage_results/*/coverage_report) + - coverage report + - coverage xml + coverage: "/TOTAL.+ ([0-9]{1,3}%)/" + artifacts: + reports: + coverage_report: + coverage_format: cobertura + path: coverage.xml + rules: + - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: on_success + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' + when: on_success + +test:safe_imports: + extends: [.test_rules] + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/builder-large + - team/megatron + services: + - name: docker:24.0.5-dind + variables: + HEALTHCHECK_TCP_PORT: "2376" + variables: + KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi + KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi + KUBERNETES_SERVICE_CPU_REQUEST: 8 + KUBERNETES_SERVICE_CPU_LIMIT: 12 + image: + name: python:3.11 + entrypoint: [""] + needs: [test:build_image] + script: + - env + - python -m ensurepip --upgrade + - python -m pip install --no-cache-dir -e . + - python -m pip install --no-cache-dir click + - python .gitlab/scripts/check_imports.py --package-name megatron.core + rules: + - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'dev' + when: never + - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: on_success + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' + when: on_success + retry: + max: 2 diff --git a/.gitlab/stages/03.integration-tests.yml b/.gitlab/stages/03.integration-tests.yml new file mode 100644 index 00000000000..70fa345e513 --- /dev/null +++ b/.gitlab/stages/03.integration-tests.yml @@ -0,0 +1,179 @@ +.integration_tests_rules: + stage: integration_tests + rules: + - if: $BUILD == "no" + when: never + - if: $INTEGRATION_TEST == "yes" + when: on_success + - when: never + +default: + id_tokens: + VAULT_JWT_TOKEN: + aud: https://stg.vault.nvidia.com + +include: + - project: dl/jet/gitlab-templates + ref: main + file: downstreams.yml + +integration:configure: + needs: + - test:build_image + - job: test:unit_tests_pyt(DEV)_mcore(latest) + optional: true + - job: test:unit_tests_pyt(LTS)_mcore(latest) + optional: true + - job: test:build_nemo_image + extends: [.integration_tests_rules] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + before_script: + - git rm -r tests/test_utils/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes + - ls tests/test_utils/local_recipes + script: + - set -x + - | + A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo 
$DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER) + - | + ARGS=( + "--scope $INTEGRATION_TEST_SCOPE" + "--n-repeat 1" + "--time-limit $INTEGRATION_TEST_TIME_LIMIT" + "--test-cases $INTEGRATION_TEST_CASES" + "--container-image ${UTILITY_IMAGE}" + "--container-tag ${CI_PIPELINE_ID}" + "--slurm-account ${CI_SLURM_ACCOUNT}" + "--no-enable-warmup" + "--dependent-job integration:configure" + "--enable-lightweight-mode" + ) + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_a100 \ + --cluster $A100_CLUSTER \ + --output-path "functional-test-job-dev-A100.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_h100 \ + --cluster $H100_CLUSTER \ + --output-path "functional-test-job-dev-H100.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_a100 \ + --cluster $A100_CLUSTER \ + --output-path "functional-test-job-lts-A100.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_h100 \ + --cluster $H100_CLUSTER \ + --output-path "functional-test-job-lts-H100.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-dev-GB200.yaml" + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-lts-GB200.yaml" + artifacts: + paths: + - functional-test-job-lts-A100.yaml + - functional-test-job-lts-H100.yaml + - functional-test-job-dev-H100.yaml + - functional-test-job-dev-A100.yaml + - functional-test-job-lts-GB200.yaml + - functional-test-job-dev-GB200.yaml + - tests/test_utils/local_recipes + +.integration_run: + needs: + - integration:configure + - test:build_image + - job: wait_for_resources + optional: true + extends: [.integration_tests_rules] + trigger: + include: + - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml + job: integration:configure + strategy: depend + variables: + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT + MCORE_MR_COMMIT: $MCORE_MR_COMMIT + MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT + inherit: + variables: true + +integration:run_lts_dgx_a100: + extends: [.integration_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: A100 + +integration:run_lts_dgx_h100: + extends: [.integration_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: H100 + +integration:run_lts_dgx_gb200: + extends: [.integration_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: GB200 + +integration:run_dev_dgx_a100: + extends: [.integration_run] + variables: + ENVIRONMENT: dev + CLUSTER: A100 + +integration:run_dev_dgx_h100: + extends: 
[.integration_run] + variables: + ENVIRONMENT: dev + CLUSTER: H100 + +integration:run_dev_dgx_gb200: + extends: [.integration_run] + variables: + ENVIRONMENT: dev + CLUSTER: GB200 diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml new file mode 100644 index 00000000000..a12b1c87213 --- /dev/null +++ b/.gitlab/stages/04.functional-tests.yml @@ -0,0 +1,308 @@ +.functional_tests_rules: + stage: functional_tests + rules: + - if: $BUILD == "no" + when: never + - if: $FUNCTIONAL_TEST == "yes" + when: on_success + - when: never +default: + id_tokens: + VAULT_JWT_TOKEN: + aud: https://stg.vault.nvidia.com + +include: + - project: dl/jet/gitlab-templates + ref: main + file: downstreams.yml + +functional:configure: + needs: + - test:build_image + - test:build_nemo_image + - job: test:unit_tests_pyt(DEV)_mcore(latest) + optional: true + - job: test:unit_tests_pyt(LTS)_mcore(latest) + optional: true + - job: integration:run_lts_dgx_a100 + optional: true + - job: integration:run_dev_dgx_a100 + optional: true + - job: integration:run_lts_dgx_h100 + optional: true + - job: integration:run_dev_dgx_h100 + optional: true + extends: [.functional_tests_rules] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + before_script: + - git rm -r tests/test_utils/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes + - ls tests/test_utils/local_recipes + script: + - set -x + - | + A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + GB200_CLUSTER=$([[ "$CLUSTER_GB200" != "" ]] && echo $CLUSTER_GB200 || echo $DEFAULT_GB200_CLUSTER) + - | + RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false") + - | + if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "weekly" ]]; then + FUNCTIONAL_TEST_NAME=$(eval echo $FUNCTIONAL_TEST_NAME) + RELEASE_ARGS=( + "--run-name" + $FUNCTIONAL_TEST_NAME + "--wandb-experiment" + $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-') + ) + else + RELEASE_ARGS=() + fi + - | + # NOTE: $FUNCTIONAL_TEST_SCOPE is supplied by external scheduled-pipeline + # configurations and uses the GitLab-side legacy values: `mr`, `mr-slim`, + # `nightly`, `weekly`, `unit-tests`, `release`. + # - `mr` / `mr-slim` / `unit-tests` / `release` pass through verbatim + # and match recipe rows tagged with the same string. They are + # intentionally NOT aliased (see LEGACY_SCOPE_ALIASES in + # tests/test_utils/python_scripts/recipe_parser.py) so the GitLab + # and GitHub matrices stay decoupled. + # - `nightly` / `weekly` are aliased to L2 / L3 by the parser and also + # inject a default cadence when the recipe has no explicit one. 
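+ # Illustrative example (hypothetical values): FUNCTIONAL_TEST_SCOPE=nightly is rewritten to the L2 tag by the parser, while FUNCTIONAL_TEST_SCOPE=mr selects recipe rows tagged "mr" verbatim.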
+ ARGS=( + "--scope $FUNCTIONAL_TEST_SCOPE" + "--n-repeat $FUNCTIONAL_TEST_REPEAT" + "--time-limit $FUNCTIONAL_TEST_TIME_LIMIT" + "--test-cases $FUNCTIONAL_TEST_CASES" + "--container-image ${UTILITY_IMAGE}" + "--container-tag ${CI_PIPELINE_ID}" + "--dependent-job functional:configure" + "--record-checkpoints ${RECORD_CHECKPOINTS}" + "--slurm-account ${CI_SLURM_ACCOUNT}" + "--no-enable-warmup" + ) + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_a100 \ + --cluster $A100_CLUSTER \ + --output-path "functional-test-job-dev-A100.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_h100 \ + --cluster $H100_CLUSTER \ + --output-path "functional-test-job-dev-H100.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_a100 \ + --cluster $A100_CLUSTER \ + --output-path "functional-test-job-lts-A100.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_h100 \ + --cluster $H100_CLUSTER \ + --output-path "functional-test-job-lts-H100.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment dev \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-dev-GB200.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ + ${ARGS[@]} \ + --environment lts \ + --platform dgx_gb200 \ + --cluster $GB200_CLUSTER \ + --output-path "functional-test-job-lts-GB200.yaml" \ + ${RELEASE_ARGS[@]} + artifacts: + paths: + - functional-test-job-lts-A100.yaml + - functional-test-job-lts-H100.yaml + - functional-test-job-dev-A100.yaml + - functional-test-job-dev-H100.yaml + - functional-test-job-lts-GB200.yaml + - functional-test-job-dev-GB200.yaml + - tests/test_utils/local_recipes + +.functional_run: + needs: + - functional:configure + - test:build_image + extends: [.functional_tests_rules] + trigger: + include: + - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml + job: functional:configure + strategy: depend + variables: + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT + MCORE_MR_COMMIT: $MCORE_MR_COMMIT + MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT + CLUSTER: $CLUSTER + + inherit: + variables: true + +functional:run_lts_dgx_a100: + extends: [.functional_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: A100 + +functional:run_lts_dgx_h100: + extends: [.functional_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: H100 + +functional:run_lts_dgx_gb200: + extends: [.functional_run] + allow_failure: true + variables: + ENVIRONMENT: lts + CLUSTER: GB200 + +functional:run_dev_dgx_a100: + extends: [.functional_run] + variables: + ENVIRONMENT: dev + CLUSTER: A100 + +functional:run_dev_dgx_h100: + extends: [.functional_run] + variables: + ENVIRONMENT: dev + CLUSTER: H100 + +functional:run_dev_dgx_gb200: + 
extends: [.functional_run] + variables: + ENVIRONMENT: dev + CLUSTER: GB200 + +functional:run_nemo: + extends: [.functional_tests_rules] + trigger: + project: "dl/joc/nemo-ci" + branch: main-mirror + strategy: depend + inherit: + variables: true + variables: + MCORE_COMMIT: $CI_COMMIT_SHA + TEST_NEMO2_MODULE: "True" + ALLOW_FAILURE_DEPENDENCY: "True" + TESTS_TO_RUN_ON_THIS_COMMIT: nightly + rules: + - if: $FUNCTIONAL_TEST == "yes" + when: manual + allow_failure: true + - when: never + +functional:x_notify: + extends: [.functional_tests_rules] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + needs: + - functional:run_lts_dgx_a100 + - functional:run_dev_dgx_a100 + - functional:run_lts_dgx_h100 + - functional:run_dev_dgx_h100 + - functional:run_lts_dgx_gb200 + - functional:run_dev_dgx_gb200 + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + variables: + RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE} + CONTEXT: $FUNCTIONAL_TEST_SCOPE + script: + - env + - | + if [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK_DEV} + else + export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + fi + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export CONTEXT=$FUNCTIONAL_TEST_SCOPE + - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0") + - export TEAM_SLUG=$SLACK_ADMIN + - | + python tests/test_utils/python_scripts/notify.py \ + --pipeline-id "${CI_PIPELINE_ID}" \ + --check-for functional-tests \ + --pipeline-context $CONTEXT \ + --pipeline-created-at "${CI_PIPELINE_CREATED_AT}" + + artifacts: + when: always + paths: + - scripts + rules: + - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes" + when: always + - when: never + +functional:x_download_golden_values: + extends: [.functional_tests_rules] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - python tests/test_utils/python_scripts/download_golden_values.py --pipeline-id ${CI_PIPELINE_ID} + artifacts: + paths: + - tests/ + rules: + - if: $FUNCTIONAL_TEST == "yes" + when: manual + allow_failure: true + - when: never diff --git a/.gitlab/stages/05.publish.yml b/.gitlab/stages/05.publish.yml new file mode 100644 index 00000000000..de30147bfd1 --- /dev/null +++ b/.gitlab/stages/05.publish.yml @@ -0,0 +1,186 @@ +.publish_common_release: + stage: publish + rules: + - if: $CI_PIPELINE_SOURCE == "web" && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" + when: manual + - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" + when: on_success + - when: never + +publish:docs: + extends: [.publish_common_release] + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + before_script: + - eval PUBLISH_COMMIT=$PUBLISH_COMMIT + - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + - git fetch origin $PUBLISH_COMMIT + - git checkout $PUBLISH_COMMIT + script: + - cd .. 
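+ # The documentation repo tracks megatron-lm as a submodule; the steps below clone it next to this checkout and bump that submodule pointer to $PUBLISH_COMMIT.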
+ - rm -rf documentation && git clone --recursive https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git + - cd documentation/megatron-lm + - git config --global user.email "mcore-bot@nvidia.com" + - git config --global user.name "Mcore Bot" + - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + - git fetch origin $PUBLISH_COMMIT + - git checkout $PUBLISH_COMMIT + - cd .. + - git add megatron-lm + - | + git commit -m 'feat: Bump mcore' + + - git push + rules: + - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"' + allow_failure: true + - when: never + +publish:upload_statistics: + stage: publish + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + needs: + - job: test:unit_tests_pyt(DEV)_mcore(latest) + - job: test:unit_tests_pyt(LTS)_mcore(latest) + - job: functional:run_lts_dgx_a100 + optional: true + - job: functional:run_lts_dgx_h100 + optional: true + - job: functional:run_dev_dgx_a100 + optional: true + - job: functional:run_dev_dgx_h100 + optional: true + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - env + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export DASHBOARD_ENDPOINT + - python tests/test_utils/python_scripts/dashboard.py --pipeline-id ${CI_PIPELINE_ID} + rules: + - if: ($CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' || $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train') && ($UNIT_TEST == "yes" || $INTEGRATION_TEST == "yes" || $FUNCTIONAL_TEST == "yes") + when: always + allow_failure: true + - when: never + + +publish:approve_merge_gate: + stage: publish + image: maniator/gh + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + script: + - | + set -eoux pipefail + EXIT_CODE=0 + apk add python3 + python3 -m venv .venv + source .venv/bin/activate + pip install --no-cache-dir python-gitlab click pygithub + export GITLAB_ENDPOINT + export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + if [[ "$CI_COMMIT_BRANCH" == *main* ]]; then + export TARGET_BRANCH="main" + elif [[ "$CI_COMMIT_BRANCH" == *dev* ]]; then + export TARGET_BRANCH="dev" + fi + + python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$TARGET_BRANCH" --once || EXIT_CODE=$? + + export GH_TOKEN=$GH_TOKEN + export REPO=NVIDIA/Megatron-LM + + if [[ $EXIT_CODE -eq 0 ]]; then + export STATUS="approved" + export COMMENT="Main is healthy. Submitting PR." + elif [[ $EXIT_CODE -eq 1 ]]; then + export STATUS="rejected" + export COMMENT="$TARGET_BRANCH is not healthy. An automation engineer is investigating. No need to take any action." + elif [[ $EXIT_CODE -eq 2 ]]; then + echo "Main is running. We won't cancel the deployment." 
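+ # Exit code 2 from check_status_of_main means checks are still in flight: neither approve nor reject, just leave the gate untouched.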
+ exit 0 + fi + + if [[ $EXIT_CODE -lt 2 ]]; then + python tests/test_utils/python_scripts/approve_merge_gate.py + fi + + retry: + max: 2 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main') + when: always + - when: never + +publish:sync_branches: + stage: publish + image: python:3.10 + script: + - set -x + - git remote add github https://github.com/NVIDIA/Megatron-LM.git || true + - git remote add gitlab https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/${CI_PROJECT_NAMESPACE}/Megatron-LM.git || true + - BRANCHES=("main" "dev") + - | + while IFS= read -r line; do + BRANCHES+=("$line") # Add each line to the array + done < <( \ + git ls-remote --heads "https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git" 'refs/heads/core_*' | \ + cut -d'/' -f3- \ + ) + - | + for BRANCH in "${BRANCHES[@]}"; do + # Define the full refspec for the branch + BRANCH_REF="refs/heads/$BRANCH" + + echo "--- Processing branch: $BRANCH ---" + + # 1. Explicitly fetch the branch ref from 'github' + # This avoids fetching a tag with the same name. + # It updates/creates the remote-tracking branch (e.g., 'refs/remotes/github/core_r0.10.0') + if ! git fetch github "$BRANCH_REF:refs/remotes/github/$BRANCH"; then + echo "Failed to fetch branch $BRANCH. Skipping." + continue + fi + + # 2. Create or update the local branch from the remote-tracking branch we just fetched. + # The -B flag creates the branch if it doesn't exist or resets it if it does. + if ! git checkout -B "$BRANCH" "github/$BRANCH"; then + echo "Failed to checkout local branch $BRANCH. Skipping." + continue + fi + + # 3. Now you are on the correct local branch, ready to push. + echo "Successfully on branch $BRANCH. 
Pushing to gitlab:" + git push -u gitlab HEAD:refs/heads/$BRANCH --force + echo "-----------------------------------" + done + tags: + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/utility + - team/megatron + retry: + max: 2 + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-sync-branches') + when: always + - when: never diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000000..851efa0e303 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: +- repo: https://github.com/psf/black + rev: 'refs/tags/24.4.2:refs/tags/24.4.2' + hooks: + - id: black + files: ^megatron/core/.*|^tests/unit_tests/.* + args: ["--skip-magic-trailing-comma", "--skip-string-normalization"] +- repo: https://github.com/pycqa/pylint + rev: v3.2.6 + hooks: + - id: pylint + files: ^megatron/core/.* +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + files: ^megatron/core/.* \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000000..e2c1e0f36fe --- /dev/null +++ b/.pylintrc @@ -0,0 +1,21 @@ +[MAIN] +ignore-paths=tests +max-line-length=100 +load-plugins=pylint.extensions.bad_builtin +[MESSAGES CONTROL] +disable=all + +enable=C0115,C0116,W0611,C0301,E0606,W0141 +# C0115: missing-class-docstring +# C0116: missing-function-docstring +# W0611: unused-import +# C0301: line-too-long +# E0606: possibly-used-before-assignment +# W0141: bad-builtin (from bad_builtin extension) + +[BASIC] +bad-functions=print + +[BAD_BUILTIN] +# Specify which builtins should be flagged +bad-builtins=print \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 00000000000..fdcfcfdfca8 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..70e8152cbf4 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,31 @@ +# Repository Guidelines + +## Skills + +The `skills/` directory contains structured guides for common tasks (running +tests, building containers, managing dependencies, submitting SLURM jobs, etc.). +**Always read the relevant `SKILL.md` before starting any task it covers — +skills are mandatory context, not optional background reading.** + +**Workflow — mandatory order for every task:** +1. **Pull information first.** Read the commit, PR, error log, file, or + whatever artifact the task is about. Do not reason about it yet. +2. **Select and invoke the skill.** Based on what you just read, identify + the relevant skill and invoke it before forming any answer or plan. +3. **Answer or implement.** Only after the skill is loaded, use its context + to reason, diagnose, or write code. + +Never skip or reorder these steps. Do not wait for the user to name the right +skill keyword — infer it from the artifact you read. + +## Contributing + +### Pull Requests + +- All PRs must be created as **drafts**. Use `gh pr create --draft` or the GitHub UI draft option. +- Never push branches directly to `https://github.com/NVIDIA/Megatron-LM`. You must push your branch to a personal fork (e.g. `https://github.com/<your-username>/Megatron-LM`), then open a PR from the fork's branch against `NVIDIA/Megatron-LM` (see the sketch below). +- Read @docs/developer/contribute.md for the full contribution policy, including code style, commit message conventions, and issue guidelines. 
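+
+A minimal sketch of the fork-based flow, assuming the `gh` CLI is installed; `fork`, `<your-username>`, and `my-feature` are placeholders:
+
+```bash
+# One-time setup: add your personal fork as a remote.
+git remote add fork https://github.com/<your-username>/Megatron-LM.git
+
+# Push the feature branch to the fork, never to NVIDIA/Megatron-LM directly.
+git push -u fork my-feature
+
+# Open a draft PR from the fork's branch against the upstream repository.
+gh pr create --draft --repo NVIDIA/Megatron-LM --head <your-username>:my-feature
+```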
+ +### Code Quality + +- After editing imports in any Python files, always run `uv run isort` on those files to fix import order before committing. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 00000000000..47dc3e3d863 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000000..6b128dce590 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing to Megatron + +Visit our [contributing page](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html). \ No newline at end of file diff --git a/LICENSE b/LICENSE index 281fde95a67..4570d9283c4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,6 @@ The following applies to all files unless otherwise noted: -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# +# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: @@ -29,13 +28,15 @@ The following applies to all files unless otherwise noted: -- This repository also contains code from Hugging Face Inc., Google Research, -Facebook (from their Fairseq and Dino projects), Microsoft(from their -Swin-Transformer project)and Philip Popien. Files from these -organizations have notices at the top of each file. Below are -licenses used in those files, as indicated. +Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their +Swin-Transformer project), Philip Popien, the Mamba project (Tri Dao and +Albert Gu), and the Triton language and compiler project (Philippe Tillet and +OpenAI). Files from these organizations have notices at the top of each file. +Below are licenses used in those files, as indicated. -------------- LICENSE FOR Facebook, huggingface and Google Research code -------------- +-------------------------------------------------------------------------------------- +-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, TinyZero and vLLM code -- Apache License @@ -240,12 +241,17 @@ licenses used in those files, as indicated. See the License for the specific language governing permissions and limitations under the License. -------------- LICENSE FOR Facebook Fairseq code -------------- +-------------------------------------------------------------------------------- +LICENSE FOR +Facebook, Inc. and its affiliates, +Meta Platforms, Inc. and its affiliates, +Microsoft Corporation, +OpenGVLab/InternVL, +Triton language and compiler, +and DeepSeek. MIT License -Copyright (c) Facebook, Inc. and its affiliates. - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -264,11 +270,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------- LICENSE FOR Mircrosoft Swin transformer code -------------- +-------------------------------------------------------------------------------- +LICENSE FOR Thinking Machines Lab MIT License -Copyright (c) Microsoft Corporation. 
+Copyright 2025 Thinking Machines Lab Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -286,91 +293,35 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE - - ---------------- NVIDIA Source Code License for SegFormer ----------------- -1. Definitions - -“Licensor” means any person or entity that distributes its Work. - -“Software” means the original work of authorship made available under this -License. - -“Work” means the Software and any additions to or derivative works of the -Software that are made available under this License. - -The terms “reproduce,” “reproduction,” “derivative works,” and -“distribution” have the meaning as provided under U.S. copyright law; -provided, however, that for the purposes of this License, derivative works -shall not include works that remain separable from, or merely link -(or bind by name) to the interfaces of, the Work. - -Works, including the Software, are “made available” under this License by -including in or with the Work either (a) a copyright notice referencing -the applicability of this License to the Work, or (b) a copy of this License. - -2. License Grant - -2.1 Copyright Grant. Subject to the terms and conditions of this License, -each Licensor grants to you a perpetual, worldwide, non-exclusive, -royalty-free, copyright license to reproduce, prepare derivative works of, -publicly display, publicly perform, sublicense and distribute its Work -and any resulting derivative works in any form. - -3. Limitations - -3.1 Redistribution. You may reproduce or distribute the Work only if -(a) you do so under this License, (b) you include a complete copy of this -License with your distribution, and (c) you retain without modification any -copyright, patent, trademark, or attribution notices that are present -in the Work. - -3.2 Derivative Works. You may specify that additional or different terms -apply to the use, reproduction, and distribution of your derivative works -of the Work (“Your Terms”) only if (a) Your Terms provide that the use -limitation in Section 3.3 applies to your derivative works, and (b) you -identify the specific derivative works that are subject to Your Terms. -Notwithstanding Your Terms, this License (including the redistribution -requirements in Section 3.1) will continue to apply to the Work itself. - -3.3 Use Limitation. The Work and any derivative works thereof only may -be used or intended for use non-commercially. Notwithstanding the -foregoing, NVIDIA and its affiliates may use the Work and any derivative -works commercially. As used herein, “non-commercially” means for research -or evaluation purposes only. - -3.4 Patent Claims. If you bring or threaten to bring a patent claim against -any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) -to enforce any patents that you allege are infringed by any Work, then -your rights under this License from such Licensor (including the grant -in Section 2.1) will terminate immediately. - -3.5 Trademarks. This License does not grant any rights to use any Licensor’s -or its affiliates’ names, logos, or trademarks, except as necessary to -reproduce the notices described in this License. +SOFTWARE. 
-3.6 Termination. If you violate any term of this License, then your rights -under this License (including the grant in Section 2.1) will terminate -immediately. +-------------------------------------------------------------------------------- +LICENSE FOR +Meta Platforms, Inc. and affiliates. -4. Disclaimer of Warranty. +BSD License -THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. -YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: -5. Limitation of Liability. + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. -EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL -THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE -SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, -INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT -OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK -(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, -LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER -COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN -ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name Meta nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000000..41bc48bfd53 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include megatron/core/requirements.txt +include megatron/core/README.md +include megatron/core/package_info.py +global-exclude LICENSE +recursive-include requirements * diff --git a/README.md b/README.md index 6bb334e8e15..9a62f9bb750 100644 --- a/README.md +++ b/README.md @@ -1,515 +1,162 @@ -Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. 
We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer-based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision. - -Below are some of the projects where we have directly used Megatron: -* [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf) -* [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf) -* [End-to-End Training of Neural Retrievers for Open-Domain Question Answering](https://arxiv.org/abs/2101.00408) -* [Large Scale Multi-Actor Generative Dialog Modeling](https://www.aclweb.org/anthology/2020.acl-main.8.pdf) -* [Local Knowledge Powered Conversational Agents](https://arxiv.org/abs/2010.10150) -* [MEGATRON-CNTRL: Controllable Story Generation with External Knowledge Using Large-Scale Language Models](https://www.aclweb.org/anthology/2020.emnlp-main.226.pdf) -* [RACE Reading Comprehension Dataset Leaderboard](http://www.qizhexie.com/data/RACE_leaderboard.html) -* [Training Question Answering Models From Synthetic Data](https://www.aclweb.org/anthology/2020.emnlp-main.468.pdf) -* [Few-shot Instruction Prompts for Pretrained Language Models to Detect Social Biases](https://arxiv.org/abs/2112.07868) -* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173) -* [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https://arxiv.org/abs/2201.11990) -* [Multi-Stage Prompting for Knowledgeable Dialogue Generation](https://arxiv.org/abs/2203.08745) - -Megatron is also used in [NeMo Megatron](https://developer.nvidia.com/nvidia-nemo#nemo-megatron), a framework to help enterprises overcome the challenges of building and training sophisticated natural language processing models with billions and trillions of parameters. - -Our codebase is capable of efficiently training very large (hundreds of billions of parameters) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs and model sizes, we consider GPT models from 1 billion all the way to 1 trillion parameters. All models use a vocabulary size of 51,200 and a sequence length of 2048. We vary hidden size, number of attention heads, and number of layers to arrive at a specific model size. As the model size increases, we also modestly increase the batch size. We leverage [NVIDIA's Selene supercomputer](https://www.top500.org/system/179842/) to perform scaling studies and use up to 3072 [A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for the largest model. Each cluster node has 8 NVIDIA 80GB A100 GPUs. The graph below shows that we scale nearly linearly up to 1 trillion parameter models running on 3072 GPUs. Note that these results are from benchmark runs and these models were not trained to convergence; however, the FLOPs are measured for end-to-end training, i.e., they include all operations, including data loading, optimization, and even logging.
- -![Scaling Graph](images/Achieved_petaFLOPs.png) - -The following table shows both model (MFU) and hardware (HFU) FLOPs utilization for select configurations up to 1T parameters (see [our paper](https://arxiv.org/pdf/2205.05198) for a description of how these are calculated). As the model size increases, we achieve better GPU utilization, and for the one trillion parameter model we reach an MFU and HFU of 56.3% and 57.0%, respectively. Note that these numbers are also measured on benchmark runs and in this case are measured using a data parallel size of one. Data parallelism introduces some overhead due to the gradient all-reduce required between the data parallel groups. However, for large transformer models, this overhead is not large and can be almost entirely eliminated by overlapping the gradient all-reduce with backpropagation. - -| Model Size | Model FLOPs Utilization | Hardware FLOPs Utilization | -| :---: | :---: | :---: | -| 22B | 41.5% | 43.7% | -| 175B | 51.4% | 52.8% | -| 530B | 56.0% | 57.0% | -| 1T | 56.3% | 57.0% | - -# Contents - * [Contents](#contents) - * [Setup](#setup) - * [Downloading Checkpoints](#downloading-checkpoints) - * [Usage](#usage) - * [Training](#training) - * [Data Preprocessing](#data-preprocessing) - * [BERT Pretraining](#bert-pretraining) - * [GPT Pretraining](#gpt-pretraining) - * [T5 Pretraining](#t5-pretraining) - * [Distributed Pretraining](#distributed-pretraining) - * [Activation Checkpointing and Recomputation](#activation-checkpointing-and-recomputation) - * [Distributed Optimizer](#distributed-optimizer) - * [FlashAttention](#flashattention) - * [GPT-3 Example](#gpt-3-example) - * [Retro](#retro) - * [Evaluation and Tasks](#evaluation-and-tasks) - * [GPT Text Generation](#gpt-text-generation) - * [GPT Evaluation](#gpt-evaluation) - * [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation) - * [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy) - * [BERT Task Evaluation](#bert-task-evaluation) - * [RACE Evaluation](#race-evaluation) - * [MNLI Evaluation](#mnli-evaluation) - * [Datasets](#datasets) - * [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data) - * [Collecting GPT Webtext Data](#collecting-gpt-webtext-data) - -# Setup -We strongly recommend using the latest release of [NGC's PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) with DGX nodes. If you can't use this for some reason, use the latest pytorch, cuda, nccl, and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start) releases. Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation, or downstream tasks. - -You can launch an instance of the PyTorch container and mount Megatron, your dataset, and checkpoints with the following Docker commands: -``` -docker pull nvcr.io/nvidia/pytorch:xx.xx-py3 -docker run --gpus all -it --rm -v /path/to/megatron:/workspace/megatron -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints nvcr.io/nvidia/pytorch:xx.xx-py3 -``` - -## Downloading Checkpoints -We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) and [GPT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints for use in evaluating or finetuning downstream tasks. To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and [set up](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI.
Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). - -Alternatively, you can directly download the checkpoints using: - -
-BERT-345M-uncased: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O megatron_bert_345m_v0.1_uncased.zip
-BERT-345M-cased: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O megatron_bert_345m_v0.1_cased.zip
-GPT-345M: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
-
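Once downloaded, each archive can be unpacked in place. A minimal Python sketch using only the standard library (the paths are illustrative; the target directory matches the `CHECKPOINT_PATH` used in the evaluation examples later in this README):

```python
import zipfile

# Unpack the GPT-345M archive fetched with wget above (filename as downloaded).
with zipfile.ZipFile("megatron_lm_345m_v0.0.zip") as archive:
    archive.extractall("checkpoints/gpt2_345m")
```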
- -The models require vocabulary files to run. The BERT WordPiece vocab file can be extracted from Google's pretrained BERT models: [uncased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt), [cased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt). The GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) can be downloaded directly. - -# Usage +
-After installation, there are several possible workflows. The most comprehensive is: -1. Data preprocessing -2. Pretraining -3. Finetuning (Optional for zero-shot tasks) -4. Downstream task evaluation or text generation +Megatron-LM and Megatron Core +============================= -However, steps 1 and 2 can be replaced by using one of the pretrained models mentioned above. +

GPU-optimized library for training transformer models at scale

-We've provided several scripts for pretraining both BERT and GPT in the [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT interactive text generation. +[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) +[![version](https://img.shields.io/badge/release-0.15.0-green)](./CHANGELOG.md) +[![license](https://img.shields.io/badge/license-Apache-blue)](./LICENSE) -# Training -## Data Preprocessing -The training data requires preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example: -
-{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
-{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
-
+
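Because the loose-json format is just one JSON object per line, a corpus in this shape can be produced with a few lines of Python. A minimal sketch (the file name and records are illustrative, not part of the repository):

```python
import json

# Only the "text" field (or whichever field --json-key selects) is used in
# training; the remaining keys are optional metadata.
records = [
    {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0"},
    {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42"},
]

with open("my-corpus.json", "w", encoding="utf-8") as f:
    for record in records:
        # One JSON object per line -- no enclosing list, no trailing commas.
        f.write(json.dumps(record) + "\n")
```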
-The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py). The other metadata are optional and are not used in training. +## About -The loose json is then processed into a binary format for training. To convert the json into the mmap, cached index file, or lazy loader format, use `preprocess_data.py`. Set the `--dataset-impl` flag to `mmap`, `cached`, or `lazy`, respectively (default is `mmap`). An example script to prepare data for BERT training is: -
-python tools/preprocess_data.py \
-       --input my-corpus.json \
-       --output-prefix my-bert \
-       --vocab bert-vocab.txt \
-       --dataset-impl mmap \
-       --tokenizer-type BertWordPieceLowerCase \
-       --split-sentences
-
+This repository contains two components: **Megatron-LM** and **Megatron Core**. -The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension. +**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation. -For T5 use the same preprocessing as BERT, perhaps renaming it to: -
-       --output-prefix my-t5 \
-
+**Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. Best for framework developers and ML engineers building custom training pipelines. -Some minor modifications are required for GPT data preprocessing, namely, the addition of a merge table, an end-of-document token, removal of sentence splitting, and a change to the tokenizer type: -
-python tools/preprocess_data.py \
-       --input my-corpus.json \
-       --output-prefix my-gpt2 \
-       --vocab gpt2-vocab.json \
-       --dataset-impl mmap \
-       --tokenizer-type GPT2BPETokenizer \
-       --merge-file gpt2-merges.txt \
-       --append-eod
-
+**[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** provides bidirectional Hugging Face ↔ Megatron checkpoint conversion with production-ready recipes. -Here the output files are named `my-gpt2_text_document.bin` and `my-gpt2_text_document.idx`. As before, in GPT training, use the longer name without the extension as `--data-path`. +## Getting Started -Further command line arguments are described in the source file [`preprocess_data.py`](./tools/preprocess_data.py). +**Install from PyPI:** -## BERT Pretraining - - -The [`examples/pretrain_bert.sh`](./examples/pretrain_bert.sh) script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` which is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`. +```bash +uv pip install megatron-core +``` -The logging, checkpoint-saving, and evaluation intervals are specified. Checkpointing the activations facilitates the training of larger models and/or batches. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions. +**Or clone and install from source:** -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +```bash +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +uv pip install -e . +``` -To run `examples/pretrain_bert.sh`, make any desired modifications including setting the environment variables for `CHECKPOINT_PATH`, `VOCAB_FILE`, and `DATA_PATH`. Make sure to set these variables to their paths in the container. Then launch the container with Megatron and necessary paths mounted (as explained in [Setup](#setup)) and run the example script. +> **Note:** Building from source can use a lot of memory. If the build runs out of memory, limit parallel compilation jobs by setting `MAX_JOBS` (e.g. `MAX_JOBS=4 uv pip install -e .`). -## GPT Pretraining +For NGC container setup and all installation options, see the **[Installation Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/get-started/install.html)**. -The `examples/pretrain_gpt.sh` script runs single GPU 345M parameter GPT pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training. 
+- **[Your First Training Run](https://docs.nvidia.com/megatron-core/developer-guide/latest/get-started/quickstart.html)** - End-to-end training examples with data preparation +- **[Parallelism Strategies](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/parallelism-guide.html)** - Scale training across GPUs with TP, PP, DP, EP, and CP +- **[Contribution Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html)** - How to contribute to Megatron Core -It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay. Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions. +# Latest News -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +- **[2026/03]** **Deprecating Python 3.10 support:** We're officially dropping Python 3.10 support with the upcoming 0.17.0 release. Downstream applications must raise their lower boundary to 3.12 to stay compatible with MCore. +- **[2026/01]** **[Dynamic Context Parallelism](https://developer.nvidia.com/blog/speeding-up-variable-length-training-with-dynamic-context-parallelism-and-nvidia-megatron-core/)** - Up to 1.48x speedup for variable-length sequence training with adaptive CP sizing. +- **[2025/12]** **Megatron Core development has moved to GitHub!** All development and CI now happens in the open. We welcome community contributions. +- **[2025/10]** **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features. +- **[2025/10]** **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models. +- **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements. +- **[2025/08]** **[GPT-OSS Model](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core. +- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools. +- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)). -`examples/pretrain_gpt.sh` can be launched the same way as described for BERT. Set the env vars and make any other modifications, launch the container with appropriate mounts, and run the script. +
+Previous News -## T5 Pretraining +- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)). +- **[2024/06]** Megatron Core added support for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba). +- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. -Very similar to BERT and GPT, the `examples/pretrain_t5.sh` script runs single GPU "base" (~220M parameter) T5 pretraining. The primary difference from BERT and GPT is the addition of the following arguments to accommodate the T5 architecture: +
-* `--kv-channels` sets the inner dimension of the "key" and "value" matrices of all attention mechanisms in the model. For BERT and GPT this defaults to the hidden size divided by the number of attention heads, but can be configured for T5. +# Project Structure -* `--ffn-hidden-size` sets the hidden size in the feed-forward networks within a transformer layer. For BERT and GPT this defaults to 4 times the transformer hidden size, but can be configured for T5. +``` +Megatron-LM/ +├── megatron/ +│ ├── core/ # Megatron Core (kernels, parallelism, building blocks) +│ │ ├── models/ # Transformer models +│ │ ├── transformer/ # Transformer building blocks +│ │ ├── tensor_parallel/ # Tensor parallelism +│ │ ├── pipeline_parallel/ # Pipeline parallelism +│ │ ├── distributed/ # Distributed training (FSDP, DDP) +│ │ ├── optimizer/ # Optimizers +│ │ ├── datasets/ # Dataset loaders +│ │ ├── inference/ # Inference engines and server +│ │ └── export/ # Model export (e.g. TensorRT-LLM) +│ ├── training/ # Training scripts +│ ├── legacy/ # Legacy components +│ ├── post_training/ # Post-training (quantization, distillation, pruning, etc.) +│ └── rl/ # Reinforcement learning (RLHF, etc.) +├── examples/ # Ready-to-use training examples +├── tools/ # Utility tools +├── tests/ # Comprehensive test suite +└── docs/ # Documentation +``` -* `--encoder-seq-length` and `--decoder-seq-length` set the sequence length for the encoder and decoder separately. +# Performance Benchmarking -All of the other arguments remain as they were for BERT and GPT pretraining. Run this example with the same steps described above for the other scripts. +For our latest performance benchmarking results, please refer to [NVIDIA Megatron Bridge Performance Summary](https://docs.nvidia.com/nemo/megatron-bridge/latest/performance-summary.html). -## Distributed Pretraining +Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters. -The `examples/pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch distributed launcher for distributed training. As such, multi-node training can be achieved by properly setting environment variables. See the official PyTorch [documentation](https://pytorch.org/docs/stable/elastic/run.html#launcher-api) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multi-node training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the `torchrun` elastic launcher (equivalent to `python -m torch.distributed.run`) are the only additional requirements to adopt distributed training. See any of `examples/pretrain_{bert,gpt,t5}_distributed.sh` for more details. +![Model table](images/model_table.png) -We use two types of parallelism: data and model parallelism. We facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model sizes. 
For example, for the 8.3 billion parameter model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time. +**Benchmark Configuration:** -Second, we developed a simple and efficient two-dimensional model-parallel approach. To use tensor model parallelism (splitting execution of a single transformer module over multiple GPUs, see Section 3 of [our paper](https://arxiv.org/pdf/1909.08053.pdf)), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use sequence parallelism specify `--sequence-parallel`, which requires tensor model parallelism as it splits among the same GPUs (more details in Section 4.2.2 of [our paper](https://arxiv.org/pdf/2205.05198.pdf)). +- **Vocabulary size**: 131,072 tokens +- **Sequence length**: 4096 tokens +- **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts +- **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default) -To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches, see Section 2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers). +**Key Results:** - +- **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training +- **Superlinear scaling**: MFU increases from 41% to 47-48% with model size +- **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging) +- **Production ready**: Full training pipeline with checkpointing and fault tolerance +- *Note: Performance results measured without training to convergence* -We have examples of how to use these two different forms of model parallelism in the example scripts ending in `distributed_with_mp.sh`: +## Weak Scaling Results -Other than these minor changes, the distributed training is identical to the training on a single GPU. +Our weak-scaling results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute. -The interleaved pipelining schedule (more details in Section 2.2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)) can be enabled using the `--num-layers-per-virtual-pipeline-stage` argument, which controls the number of transformer layers in a virtual stage (by default with the non-interleaved schedule, each GPU will execute a single virtual stage with `NUM_LAYERS / PIPELINE_MP_SIZE` transformer layers). The total number of layers in the transformer model should be divisible by this argument value.
Additionally, the number of microbatches in the pipeline (computed as `GLOBAL_BATCH_SIZE / (DATA_PARALLEL_SIZE * MICRO_BATCH_SIZE)`) should be divisible by the `PIPELINE_MP_SIZE` when using this schedule (this condition is checked in an assertion in the code). The interleaved schedule is not supported for pipelines with 2 stages (`PIPELINE_MP_SIZE=2`). +![Weak scaling](images/weak_scaling.png) -## Activation Checkpointing and Recomputation +## Strong Scaling Results -To reduce GPU memory usage, and thus allow a large model to fit on a training system, we support activation checkpointing and recomputation. We support two levels of recompute granularity: `selective` and `full`. Selective recomputation is the default and recommended in almost all cases. It saves the activations that take less space and are expensive to recompute, and recomputes activations that take a lot of space but are relatively cheap to recompute (see [our paper](https://arxiv.org/pdf/2205.05198) for details). To enable selective activation recompute simply use `--recompute-activations`. +We also strong scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%. -For cases where memory is very tight, `full` checkpointing saves just the inputs to a transformer layer, or a block of transformer layers, and recomputes everything else. To turn on full activation recompute use `--recompute-granularity full`. When using full activation recomputation, there are two methods: `uniform` and `block`, chosen using the `--recompute-method` argument. +![Strong scaling](images/strong_scaling.png) -* The uniform method uniformly divides the Transformer layers into groups of layers and stores the input activations of each group in memory. The baseline group size is 1 and, in this case, the input activation of each Transformer layer is checkpointed. When the GPU memory is insufficient, increasing the number of layers per group reduces the memory usage, thus enabling a bigger model to run. For example, when the number of layers per group is 4, the input activation of each group of 4 Transformer layers is checkpointed. +# Roadmaps -* The block method checkpoints the input activations of a set number of individual Transformer layers per pipeline stage and does the rest of the layers without any checkpointing. This method can be used to skip checkpointing some Transformer layers until the GPU memory is fully used, which is applicable only when there is unused GPU memory. Checkpointing fewer transformer layers avoids unnecessary activation recomputation in the backprop and thus improves training performance. For example, when we specify 5 layers to checkpoint out of 8 layers per pipeline stage, the input activations of only the first 5 Transformer layers are checkpointed and activation recomputation for the remaining 3 layers is not needed in the backprop. +- **[MoE Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements +# Resources -## Distributed Optimizer +## Getting Help -Usage: `--use-distributed-optimizer`. Compatible with all model and data types.
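To get a feel for the per-parameter memory accounting that the distributed optimizer table below quantifies, the formulas can be evaluated directly. A rough sketch (the helper is illustrative, not repository code; `d` is the data-parallel size):

```python
def optimizer_bytes_per_param(dtype_combo: str, d: int, distributed: bool = True) -> float:
    """Theoretical bytes per parameter, using the formulas from the table below."""
    formulas = {
        # combo: (non-distributed bytes, distributed bytes as a function of d)
        "fp16 param, fp16 grads": (20, 4 + 16 / d),
        "bf16 param, fp32 grads": (18, 6 + 12 / d),
        "fp32 param, fp32 grads": (16, 8 + 8 / d),
    }
    non_dist, dist = formulas[dtype_combo]
    return dist if distributed else non_dist

# With 64-way data parallelism, fp16 training drops from 20 to 4.25 bytes/param.
print(optimizer_bytes_per_param("fp16 param, fp16 grads", d=64))
```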
+ +- 📖 **[Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html)** - Official documentation +- 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests -The distributed optimizer is a memory savings technique, whereby the optimizer state is evenly distributed across data parallel ranks (versus the traditional method of replicating the optimizer state across data parallel ranks). As described in [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054), our implementation distributes all optimizer state that does not overlap with the model state. For example, when using fp16 model params, the distributed optimizer maintains its own separate copy of fp32 main params & grads, which are distributed across DP ranks. When using bf16 model params, however, the distributed optimizer's fp32 main grads are the same as the model's fp32 grads, and so the grads in this case are not distributed (although the fp32 main params are still distributed, as they are separate from the bf16 model params). +## Contributing -Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In our implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size): +We ❤️ contributions! Ways to contribute: -| | Non-distributed optim | Distributed optim | -|-|-|-| -| fp16 param, fp16 grads | 20 | 4 + 16/d | -| bf16 param, fp32 grads | 18 | 6 + 12/d | -| fp32 param, fp32 grads | 16 | 8 + 8/d | +- 🐛 **Report bugs** - Help us improve reliability +- 💡 **Suggest features** - Shape the future of Megatron Core +- 📝 **Improve docs** - Make Megatron Core more accessible +- 🔧 **Submit PRs** - Contribute code improvements -## FlashAttention +**→ [Contributing Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html)** -Usage: `--use-flash-attn`. Supports attention head dimensions of at most 128. +## Citation -[FlashAttention](https://github.com/HazyResearch/flash-attention) is a fast and -memory-efficient algorithm to compute exact attention. It speeds up model -training and reduces memory requirements. +If you use Megatron in your research or project, we appreciate that you use the following citations: -To install FlashAttention: -```sh -pip install flash-attn -``` +```bibtex +@article{megatron-lm, + title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, + author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan}, + journal={arXiv preprint arXiv:1909.08053}, + year={2019} +} +``` - -## GPT-3 Example - -In `examples/pretrain_gpt3_175B.sh` we have provided an example of how to configure Megatron to run [GPT-3](https://arxiv.org/abs/2005.14165) with 175 billion parameters on 1024 GPUs. The script is designed for [slurm](https://slurm.schedmd.com/documentation.html) with the [pyxis](https://github.com/NVIDIA/pyxis) plugin but can be easily adapted to any other scheduler. It uses 8-way and 16-way tensor and pipeline parallelism, respectively. With options `global-batch-size 1536` and `rampup-batch-size 16 16 5859375`, the training will start with global batch size 16 and linearly increase the global batch size to 1536 over 5,859,375 samples with incremental steps of 16. The training dataset can be either a single set or multiple datasets combined with a set of weights.
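The ramp-up arithmetic implied by `rampup-batch-size 16 16 5859375` can be sketched in a few lines. This is an illustration of the schedule's shape only, not the exact implementation in Megatron:

```python
def rampup_global_batch(consumed_samples: int, start: int = 16, increment: int = 16,
                        rampup_samples: int = 5_859_375, final: int = 1536) -> int:
    """Approximate global batch size after `consumed_samples` samples."""
    steps = (final - start) // increment        # 95 increments from 16 to 1536
    samples_per_step = rampup_samples // steps  # samples consumed per increment
    ramped = start + increment * min(steps, consumed_samples // samples_per_step)
    return min(ramped, final)

print(rampup_global_batch(0))           # 16 at the start of training
print(rampup_global_batch(3_000_000))   # mid-ramp
print(rampup_global_batch(6_000_000))   # 1536 once the ramp completes
```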
- -With the full global batch size of 1536 on 1024 A100 GPUs, each iteration takes around 32 seconds, resulting in 138 teraFLOPs per GPU, which is 44% of the theoretical peak FLOPs. - - -## Retro - -See: - -- `tools/retro/README.md` for an overview. -- `tools/retro/examples/get_preprocess_cmd.sh` for an example of common preprocessing arguments. -- `tools/retro/examples/preprocess_data.sh` for an example of how to preprocess data. -- `tools/retro/examples/pretrain_model.sh` for an example of how to pretrain a model. - -Retro is a retrieval-enhanced model that is based on GPT. As described in [Improving language models by retrieving from trillions of tokens](https://arxiv.org/abs/2112.04426), Retro retrieves from a database of document chunks by performing locality search using a sample's tokens. The retrieval database can be large -- often billions or even trillions of tokens -- and provides a more efficient storage mechanism for factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters. - -Using Retro requires two steps: 1) preprocessing the retrieval database and pretraining neighbors, and 2) pretraining a model using this data. Please see `tools/retro/README.md` for a detailed overview. - - - -# Evaluation and Tasks - -We provide several command line arguments, detailed in the scripts listed below, to handle various zero-shot and fine-tuned downstream tasks. However, you can also finetune your model from a pretrained checkpoint on other corpora as desired. To do so, simply add the `--finetune` flag and adjust the input files and training parameters within the original training script. The iteration count will be reset to zero, and the optimizer and internal state will be reinitialized. If the fine-tuning is interrupted for any reason, be sure to remove the `--finetune` flag before continuing; otherwise the training will start again from the beginning. - -Because evaluation requires substantially less memory than training, it may be advantageous to merge a model trained in parallel for use on fewer GPUs in downstream tasks. The following script accomplishes this. This example reads in a GPT model with 4-way tensor and 4-way pipeline model parallelism and writes out a model with 2-way tensor and 2-way pipeline model parallelism. -
-python tools/checkpoint_util.py \
-        --model-type GPT \
-        --load-dir checkpoints/gpt3_tp4_pp4 \
-        --save-dir checkpoints/gpt3_tp2_pp2 \
-        --target-tensor-parallel-size 2 \
-        --target-pipeline-parallel-size 2
-
-
- -Several downstream tasks are described for both GPT and BERT models below. They can be run in distributed and model parallel modes with the same changes used in the training scripts. - -## GPT Text Generation - -We have included a simple REST server to use for text generation in `tools/run_text_generation_server.py`. You run it much like you would start a pretraining job, specifying an appropriate pretrained checkpoint. There are also a few optional parameters: `temperature`, `top-k`, and `top-p`. See `--help` or the source file for more information. See [examples/run_text_generation_server_345M.sh](examples/run_text_generation_server_345M.sh) for an example of how to run the server. - -Once the server is running you can use `tools/text_generation_cli.py` to query it; it takes one argument, the host the server is running on. - -
-tools/text_generation_cli.py localhost:5000
-
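The same query can also be issued programmatically. A minimal Python sketch of the PUT request shown in the curl example below, using only the standard library (the server address is assumed to be the default `localhost:5000`):

```python
import json
import urllib.request

payload = json.dumps({"prompts": ["Hello world"], "tokens_to_generate": 1}).encode("utf-8")
request = urllib.request.Request(
    "http://localhost:5000/api",
    data=payload,
    headers={"Content-Type": "application/json; charset=UTF-8"},
    method="PUT",
)
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read()))
```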
- -You can also use CURL or any other tools to query the server directly: - -
-curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8'  -d '{"prompts":["Hello world"], "tokens_to_generate":1}'
-
- -See [megatron/text_generation_server.py](megatron/text_generation_server.py) for more API options. - -### Detoxify GPT via Self-generation -We include an example in `examples/detxoify_lm/` to detoxify language models by leveraging the generative power of language models. - -See [examples/detxoify_lm/README.md](examples/detxoify_lm/README.md) for step-by-step tutorials on how to perform domain-adaptive training and detoxify LM using self-generated corpus. - - -## GPT Evaluation -We include example scripts for GPT evaluation on WikiText perplexity evaluation and LAMBADA Cloze accuracy. - -### WikiText Perplexity Evaluation -For even comparison with prior works, we evaluate perplexity on the word-level [WikiText-103 test dataset](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip), and appropriately compute perplexity given the change in tokens when using our subword tokenizer. - -We use the following command to run WikiText-103 evaluation on a 345M parameter model. -
-TASK="WIKITEXT103"
-
-VALID_DATA=<wikitext path>.txt
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-
-COMMON_TASK_ARGS="--num-layers 24 \
-                  --hidden-size 1024 \
-                  --num-attention-heads 16 \
-                  --seq-length 1024 \
-                  --max-position-embeddings 1024 \
-                  --fp16 \
-                  --vocab-file $VOCAB_FILE"
-
-python tasks/main.py \
-       --task $TASK \
-       $COMMON_TASK_ARGS \
-       --valid-data $VALID_DATA \
-       --tokenizer-type GPT2BPETokenizer \
-       --merge-file $MERGE_FILE \
-       --load $CHECKPOINT_PATH \
-       --micro-batch-size 8 \
-       --activations-checkpoint-method uniform \
-       --log-interval 10 \
-       --no-load-optim \
-       --no-load-rng
-
- - -### LAMBADA Cloze Accuracy -To compute LAMBADA cloze accuracy (the accuracy of predicting the last token given the preceding tokens) we utilize a detokenized, processed version of the [LAMBADA dataset](https://github.com/cybertronai/bflm/blob/master/lambada_test.jsonl). - -We use the following command to run LAMBADA evaluation on a 345M parameter model. Note that the `--strict-lambada` flag should be used to require whole-word matching. Make sure that `lambada` is part of the file path. - -
-TASK="LAMBADA"
-
-VALID_DATA=<lambada path>.json
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-COMMON_TASK_ARGS=<same as those in WikiText Perplexity Evaluation above>
-
-python tasks/main.py \
-       --task $TASK \
-       $COMMON_TASK_ARGS \
-       --valid-data $VALID_DATA \
-       --tokenizer-type GPT2BPETokenizer \
-       --strict-lambada \
-       --merge-file $MERGE_FILE \
-       --load $CHECKPOINT_PATH \
-       --micro-batch-size 8 \
-       --activations-checkpoint-method uniform \
-       --log-interval 10 \
-       --no-load-optim \
-       --no-load-rng
-
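Conceptually, cloze accuracy is just the fraction of examples whose final word is predicted correctly. A toy sketch of the metric (illustrative only; the real task scores the model's prediction for the final position, and `--strict-lambada` requires the whole word to match rather than just the first subword token):

```python
def cloze_accuracy(predicted_words, target_words):
    """Fraction of examples where the predicted final word matches the target."""
    correct = sum(p == t for p, t in zip(predicted_words, target_words))
    return correct / len(target_words)

print(cloze_accuracy(["dog", "moon"], ["dog", "sun"]))  # 0.5
```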
- -Further command line arguments are described in the source file [`main.py`](./tasks/main.py). -## BERT Task Evaluation -### RACE Evaluation -The following script finetunes the BERT model for evaluation on the [RACE dataset](http://www.cs.cmu.edu/~glai1/data/race/). The `TRAIN_DATA` and `VALID_DATA` directories contain the RACE dataset as separate `.txt` files. Note that for RACE, the batch size is the number of RACE queries to evaluate. Since each RACE query has four samples, the effective batch size passed through the model will be four times the batch size specified on the command line. -
-TRAIN_DATA="data/RACE/train/middle"
-VALID_DATA="data/RACE/dev/middle \
-            data/RACE/dev/high"
-VOCAB_FILE=bert-vocab.txt
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-CHECKPOINT_PATH=checkpoints/bert_345m_race
-COMMON_TASK_ARGS="--num-layers 24 \
-                  --hidden-size 1024 \
-                  --num-attention-heads 16 \
-                  --seq-length 512 \
-                  --max-position-embeddings 512 \
-                  --fp16 \
-                  --vocab-file $VOCAB_FILE"
-
-COMMON_TASK_ARGS_EXT="--train-data $TRAIN_DATA \
-                      --valid-data $VALID_DATA \
-                      --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
-                      --activations-checkpoint-method uniform \
-                      --save-interval 10000 \
-                      --save $CHECKPOINT_PATH \
-                      --log-interval 100 \
-                      --eval-interval 1000 \
-                      --eval-iters 10 \
-                      --weight-decay 1.0e-1"
-
-python tasks/main.py \
-       --task RACE \
-       $COMMON_TASK_ARGS \
-       $COMMON_TASK_ARGS_EXT \
-       --tokenizer-type BertWordPieceLowerCase \
-       --epochs 3 \
-       --micro-batch-size 4 \
-       --lr 1.0e-5 \
-       --lr-warmup-fraction 0.06
-
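The effective-batch-size note above is worth making concrete: with the `--micro-batch-size 4` used in this script, the model actually sees 16 samples per step. A quick illustrative check:

```python
micro_batch_size = 4   # --micro-batch-size above: number of RACE queries per step
samples_per_query = 4  # each RACE query carries four candidate answers
effective_batch_size = micro_batch_size * samples_per_query
print(effective_batch_size)  # 16 samples actually pass through the model
```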
- -### MNLI Evaluation -The following script finetunes the BERT model for evaluation with the [MultiNLI sentence pair corpus](https://www.nyu.edu/projects/bowman/multinli/). Because the matching tasks are quite similar, the script can be quickly tweaked to work with the [Quora Question Pairs](https://www.kaggle.com/quora/question-pairs-dataset) (QQP) dataset as well. - -
-
-TRAIN_DATA="data/glue_data/MNLI/train.tsv"
-VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
-            data/glue_data/MNLI/dev_mismatched.tsv"
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m_mnli
-COMMON_TASK_ARGS=<same as those in RACE Evaluation above>
-COMMON_TASK_ARGS_EXT=<same as those in RACE Evaluation above>
-
-python tasks/main.py \
-       --task MNLI \
-       $COMMON_TASK_ARGS \
-       $COMMON_TASK_ARGS_EXT \
-       --tokenizer-type BertWordPieceLowerCase \
-       --epochs 5 \
-       --micro-batch-size 8 \
-       --lr 5.0e-5 \
-       --lr-warmup-fraction 0.065
-
- -# Datasets -We do not host any datasets for GPT or BERT training, however, we detail their collection so that our results may be reproduced. - -## Collecting Wikipedia Training Data -We recommend following the Wikipedia data extraction process specified by Google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text." - -We recommend using the `--json` argument when using WikiExtractor, which will dump the Wikipedia data into loose json format (one json per line), making it more manageable on the file system and also readily consumable by our codebase. We recommend further preprocessing this json dataset by nltk punctuation standardization. For BERT training, use the `--split-sentences` flag to `preprocess_data.py` as described [above](#data-preprocessing) to include sentence breaks in the produced index. If you'd like to use Wikipedia data for GPT training you should still clean it with nltk/spacy/ftfy, but do not use the `--split-sentences` flag. - -## Collecting GPT Webtext Data -We utilize the publicly available [OpenWebText](https://github.com/eukaryote31/openwebtext) library from [jcpeterson](https://github.com/jcpeterson/openwebtext) and [eukaryote31's](https://github.com/eukaryote31/openwebtext) work to download urls. We then filtered, cleaned, and deduplicated all downloaded content according to the procedure described in our [openwebtext](./tools/openwebtext) directory. For reddit URLs corresponding to content up to October 2018 we arrived at approximately 37GB of content. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000000..728cdb4a1d2 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ +## Security + +NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. + +If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.** If a potential security issue is inadvertently reported via a public issue or pull request, NVIDIA maintainers may limit public discussion and redirect the reporter to the appropriate private disclosure channels. + +## Reporting Potential Security Vulnerability in an NVIDIA Product + +To report a potential security vulnerability in any NVIDIA product: + +- Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) +- E-Mail: psirt@nvidia.com + - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) + - Please include the following information: + - Product/Driver name and version/branch that contains the vulnerability + - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) + - Instructions to reproduce the vulnerability + - Proof-of-concept or exploit code + - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability + +While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. 
Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. + +## NVIDIA Product Security + +For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000000..aa37017f082 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,14 @@ +comment: false +coverage: + status: + project: false + patch: + default: + target: 80% + threshold: 5% + base: auto + if_ci_failed: error + if_no_uploads: success + if_not_found: success +fixes: + - "/opt/megatron-lm/::" diff --git a/docker/.ngc_version.dev b/docker/.ngc_version.dev new file mode 100644 index 00000000000..3356f1f0bca --- /dev/null +++ b/docker/.ngc_version.dev @@ -0,0 +1 @@ +nvcr.io/nvidia/pytorch:26.04-py3 diff --git a/docker/.ngc_version.lts b/docker/.ngc_version.lts new file mode 100644 index 00000000000..6b72812b34f --- /dev/null +++ b/docker/.ngc_version.lts @@ -0,0 +1 @@ +nvcr.io/nvidia/pytorch:25.09-py3 \ No newline at end of file diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev new file mode 100644 index 00000000000..377bd4a2005 --- /dev/null +++ b/docker/Dockerfile.ci.dev @@ -0,0 +1,103 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM ${FROM_IMAGE_NAME} as main +ENV PIP_CONSTRAINT="" +ENV DEBIAN_FRONTEND=noninteractive +ARG UV_VERSION=0.7.2 +ARG YQ_VERSION=4.44.1 +ENV PATH="/root/.local/bin:$PATH" +ARG UV_PROJECT_ENVIRONMENT=/opt/venv +ENV UV_PROJECT_ENVIRONMENT=${UV_PROJECT_ENVIRONMENT} +ENV VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT +ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" +ENV UV_LINK_MODE=copy + +RUN bash -ex <<"EOF" + apt-get update + apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime + apt-get clean + python -m venv /opt/jet + ARCH=$(uname -m) + case "${ARCH}" in \ + "x86_64") YQ_ARCH=amd64 ;; \ + "aarch64") YQ_ARCH=arm64 ;; \ + "armv7l") YQ_ARCH=arm ;; \ + *) echo "Unsupported architecture: ${ARCH}" && exit 1 ;; \ + esac + wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_${YQ_ARCH} -O /usr/local/bin/yq + chmod a+x /usr/local/bin/yq + curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh +EOF + +COPY README.md pyproject.toml uv.lock /workspace/ +COPY megatron/core/__init__.py /workspace/megatron/core/ +COPY megatron/core/package_info.py /workspace/megatron/core/ +ARG IMAGE_TYPE=dev +ENV NVTE_BUILD_NUM_PHILOX_ROUNDS=3 +RUN --mount=type=cache,target=/root/.cache/uv \ + bash -ex <<"EOF" + export NVTE_CUDA_ARCHS="80;90;100" + uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages + uv sync --only-group build + uv sync --extra ${IMAGE_TYPE} --extra mlm --extra ssm --extra te --link-mode copy --locked \ + --no-install-package torch \ + --no-install-package torchvision \ + --no-install-package triton \ + --no-install-package transformer-engine-cu12 \ + --no-install-package nvidia-cublas-cu12 \ + --no-install-package nvidia-cuda-cupti-cu12 \ + --no-install-package nvidia-cuda-nvrtc-cu12 \ + --no-install-package nvidia-cuda-runtime-cu12 \ + --no-install-package nvidia-cudnn-cu12 \ + --no-install-package nvidia-cufft-cu12 \ + --no-install-package nvidia-cufile-cu12 \ + --no-install-package nvidia-curand-cu12 \ + --no-install-package nvidia-cusolver-cu12 \ + --no-install-package nvidia-cusparse-cu12 \ + --no-install-package nvidia-cusparselt-cu12 \ + 
--no-install-package nvidia-nccl-cu12 +EOF + +# Install DeepEP +COPY docker/patches/deepep.patch /workspace/deepep.patch +RUN bash -ex <<"EOF" + cd /workspace + uv pip install nvidia-nvshmem-cu13==3.4.5 + pushd /opt/venv/lib/python3.12/site-packages/nvidia/nvshmem/lib/ + ln -s libnvshmem_host.so.3 libnvshmem_host.so + popd + + git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git + pushd DeepEP + git checkout 34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72 + patch -p1 < /workspace/deepep.patch + popd + TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. + rm -rf DeepEP +EOF + +COPY assets/ /opt/data/ +ENV UV_PYTHON=$UV_PROJECT_ENVIRONMENT/bin/python + +##### For NVIDIANS only ##### +FROM main as jet +ARG JET_API_VERSION +ENV PATH="$PATH:/opt/jet/bin" +RUN --mount=type=secret,id=JET_INDEX_URLS bash -ex <<"EOF" + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) + python -m venv /opt/jet + /opt/jet/bin/pip install --no-cache-dir $JET_INDEX_URLS \ + "jet-api==$JET_API_VERSION" "setuptools<82.0.0" +EOF + +RUN --mount=type=secret,id=JET_INDEX_URLS \ + --mount=type=secret,id=LOGGER_INDEX_URL bash -ex <<"EOF" + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) + LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL) + uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger" + uv pip install --no-cache-dir --upgrade "setuptools>=80" + uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=4.0" +EOF +### diff --git a/docker/Dockerfile.ci.nemo b/docker/Dockerfile.ci.nemo new file mode 100644 index 00000000000..b00349e101a --- /dev/null +++ b/docker/Dockerfile.ci.nemo @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM ${FROM_IMAGE_NAME} as main + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext && \ + apt-get clean && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +##### For NVIDIANS only ##### +FROM main as jet +ARG JET_API_VERSION +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install --no-cache-dir jet-api==$JET_API_VERSION "jet-client~=4.0" --upgrade $JET_INDEX_URLS + +ENV PATH="$PATH:/opt/jet/bin" +### diff --git a/docker/Dockerfile.linting b/docker/Dockerfile.linting new file mode 100644 index 00000000000..259c0bbedcd --- /dev/null +++ b/docker/Dockerfile.linting @@ -0,0 +1,23 @@ +# syntax=docker/dockerfile:experimental + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive +ARG UV_VERSION=0.7.2 +ARG YQ_VERSION=4.44.1 +ENV PATH="/root/.local/bin:$PATH" +ENV UV_PROJECT_ENVIRONMENT=/opt/venv +ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" +ENV UV_LINK_MODE=copy +RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh +WORKDIR /opt/megatron-lm +COPY pyproject.toml uv.lock /opt/megatron-lm/ +COPY megatron/core/package_info.py megatron/core/__init__.py /opt/megatron-lm/megatron/core/ +RUN uv sync --locked --only-group linting --only-group test --only-group ci + +##### For NVIDIANS only ##### +FROM main as jet +ARG JET_API_VERSION +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + uv pip install --no-cache-dir "jet-client~=2.0" --upgrade $JET_INDEX_URLS diff --git a/docker/common/install.sh b/docker/common/install.sh new file mode 100644 index 
00000000000..3991fd41448 --- /dev/null +++ b/docker/common/install.sh @@ -0,0 +1,146 @@ +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --base-image) + BASE_IMAGE="$2" + shift 2 + ;; + --python-version) + PYTHON_VERSION="$2" + shift 2 + ;; + --environment) + ENVIRONMENT="$2" + shift 2 + ;; + --use-uv) + USE_UV="true" + shift 1 + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 --base-image {pytorch|ubuntu} [--use-uv] [--python-version] [--environment]" + exit 1 + ;; + esac +done + +if [[ -z "${PYTHON_VERSION:-}" ]]; then + PYTHON_VERSION="3.12" +fi + +if [[ -z "${USE_UV:-}" ]]; then + USE_UV="false" +fi + +# Validate base image argument +if [[ -z "${BASE_IMAGE:-}" || -z "${ENVIRONMENT:-}" ]]; then + echo "Error: --base-image argument is required" + echo "Usage: $0 --base-image {pytorch|ubuntu} --environment {dev|lts}" + exit 1 +fi + +if [[ "$BASE_IMAGE" != "pytorch" && "$BASE_IMAGE" != "ubuntu" ]]; then + echo "Error: --base-image must be either 'pytorch' or 'ubuntu'" + echo "Usage: $0 --base-image {pytorch|ubuntu}" + exit 1 +fi + +if [[ "$ENVIRONMENT" != "dev" && "$ENVIRONMENT" != "lts" ]]; then + echo "Error: --environment must be either 'dev' or 'lts'" + echo "Usage: $0 --environment {dev|lts}" + exit 1 +fi + +main() { + if [[ -n "${PAT:-}" ]]; then + echo -e "machine github.com\n login token\n password $PAT" >~/.netrc + chmod 600 ~/.netrc + fi + + # Install dependencies + export DEBIAN_FRONTEND=noninteractive + + # Install Python + apt-get update + apt-get install -y software-properties-common + add-apt-repository ppa:deadsnakes/ppa -y + apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python$PYTHON_VERSION 1 + + # Install tools + apt-get update + apt-get install -y wget curl git cmake + + # Install CUDA + if [[ "$BASE_IMAGE" == "ubuntu" ]]; then + rm /etc/apt/sources.list.d/cuda*.list || true + rm /etc/apt/sources.list.d/nvidia-cuda.list || true + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb + dpkg -i cuda-keyring_1.1-1_all.deb + rm cuda-keyring_1.1-1_all.deb + apt-get update + apt-get install -y cuda-toolkit-12-8 cudnn-cuda-12 libcudnn9-cuda-12 libcutlass-dev + fi + + # Clean up + apt-get clean + + unset PIP_CONSTRAINT + + if [[ "$USE_UV" == "true" ]]; then + if [[ "$BASE_IMAGE" == "pytorch" ]]; then + UV_ARGS=( + "--no-install-package" "torch" + "--no-install-package" "torchvision" + "--no-install-package" "triton" + "--no-install-package" "nvidia-cublas-cu12" + "--no-install-package" "nvidia-cuda-cupti-cu12" + "--no-install-package" "nvidia-cuda-nvrtc-cu12" + "--no-install-package" "nvidia-cuda-runtime-cu12" + "--no-install-package" "nvidia-cudnn-cu12" + "--no-install-package" "nvidia-cufft-cu12" + "--no-install-package" "nvidia-cufile-cu12" + "--no-install-package" "nvidia-curand-cu12" + "--no-install-package" "nvidia-cusolver-cu12" + "--no-install-package" "nvidia-cusparse-cu12" + "--no-install-package" "nvidia-cusparselt-cu12" + "--no-install-package" "nvidia-nccl-cu12" + ) + else + UV_ARGS=() + fi + + # Install uv + UV_VERSION="0.7.2" + curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh + + # Create virtual environment and install dependencies + uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages + + # Install dependencies + uv sync --locked --only-group build ${UV_ARGS[@]} + uv sync \ + 
--link-mode copy \ + --locked \ + --extra ${ENVIRONMENT} \ + --all-groups ${UV_ARGS[@]} + + # Install the package + uv pip install --no-deps -e . + else + python3 -m venv $UV_PROJECT_ENVIRONMENT + . $UV_PROJECT_ENVIRONMENT/bin/activate + + pip install --pre --no-cache-dir --upgrade pip + pip install --pre --no-cache-dir torch pybind11 wheel_stub ninja wheel packaging "setuptools>=80" + pip install --pre --no-cache-dir --no-build-isolation . + fi + +} + +# Call the main function +main "$@" diff --git a/docker/common/install_source_wheels.sh b/docker/common/install_source_wheels.sh new file mode 100644 index 00000000000..7eaaef2e46f --- /dev/null +++ b/docker/common/install_source_wheels.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +INPUT_WHEEL_DIR=$(pwd)/wheels + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --input-wheel-dir) + INPUT_WHEEL_DIR="$2" + shift 2 + ;; + --environment) + ENVIRONMENT="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 --input-wheel-dir DIR --environment ENV" + exit 1 + ;; + esac +done + +# Check if required arguments are provided (use :- so the check itself does not trip `set -u`) +if [ -z "${INPUT_WHEEL_DIR:-}" ] || [ -z "${ENVIRONMENT:-}" ]; then + echo "Error: --input-wheel-dir and --environment are required" + echo "Usage: $0 --input-wheel-dir DIR --environment ENV" + exit 1 +fi + +if [ "$ENVIRONMENT" = "dev" ]; then + TE_WHEEL=$(ls $INPUT_WHEEL_DIR/transformer_engine*.whl) || true + [ -z "$TE_WHEEL" ] && TE_WHEEL=$(bash docker/common/build_te.sh --output-wheel-dir $INPUT_WHEEL_DIR | tail -n 1) +fi + +MAMBA_WHEEL=$(ls $INPUT_WHEEL_DIR/mamba*.whl) || true +[ -z "$MAMBA_WHEEL" ] && MAMBA_WHEEL=$(bash docker/common/build_mamba.sh --output-wheel-dir $INPUT_WHEEL_DIR | tail -n 1) + +CAUSALCONV1D_WHEEL=$(ls $INPUT_WHEEL_DIR/causal_conv1d*.whl) || true +[ -z "$CAUSALCONV1D_WHEEL" ] && CAUSALCONV1D_WHEEL=$(bash docker/common/build_causalconv1d.sh --output-wheel-dir $INPUT_WHEEL_DIR | tail -n 1) + +# Override deps that are already present in the base image +# only for dev +if [ "$ENVIRONMENT" = "dev" ]; then + uv pip install --no-cache-dir --no-deps $TE_WHEEL +fi + +# Install heavy optional deps like mamba, causalconv1d +uv pip install --no-cache-dir \ + $MAMBA_WHEEL \ + $CAUSALCONV1D_WHEEL \ + "setuptools>=80" diff --git a/docker/patches/deepep.patch b/docker/patches/deepep.patch new file mode 100644 index 00000000000..070e4581481 --- /dev/null +++ b/docker/patches/deepep.patch @@ -0,0 +1,13 @@ +diff --git a/setup.py b/setup.py +index 63ce332..4e13462 100644 +--- a/setup.py ++++ b/setup.py +@@ -37,7 +37,7 @@ if __name__ == '__main__': + '-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes'] + nvcc_flags = ['-O3', '-Xcompiler', '-O3'] + sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu'] +- include_dirs = ['csrc/'] ++ include_dirs = ['csrc/', '/usr/local/cuda/include/cccl/'] + library_dirs = [] + nvcc_dlink = [] + extra_link_args = [] diff --git a/docs/add_copyright_header.py b/docs/add_copyright_header.py new file mode 100644 index 00000000000..9694ef84819 --- /dev/null +++ b/docs/add_copyright_header.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +"""One-off script to add NVIDIA copyright header to all .md files under docs/.""" + +from pathlib import Path + +HEADER = """ Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved. 
+ NVIDIA CORPORATION and its licensors retain all intellectual property + and proprietary rights in and to this software, related documentation + and any modifications thereto. Any use, reproduction, disclosure or + distribution of this software and related documentation without an express + license agreement from NVIDIA CORPORATION is strictly prohibited. + +""" + +def main(): + docs_dir = Path(__file__).resolve().parent + already_has = "Copyright (c) 2022-2026, NVIDIA CORPORATION" + count = 0 + for path in sorted(docs_dir.rglob("*.md")): + content = path.read_text(encoding="utf-8") + if content.strip().startswith(already_has): + continue + new_content = HEADER + content + path.write_text(new_content, encoding="utf-8") + count += 1 + print(path.relative_to(docs_dir)) + print(f"\nUpdated {count} files.") + +if __name__ == "__main__": + main() diff --git a/docs/advanced/index.md b/docs/advanced/index.md new file mode 100644 index 00000000000..98ff0806cff --- /dev/null +++ b/docs/advanced/index.md @@ -0,0 +1,14 @@ + + +# Discussions + +In-depth technical discussions and optimization guides: + +- [Optimizing DeepSeek-V3 Training on GB200 NVL72](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md) - Achieving 970 TFLOPS/GPU with MXFP8, kernel optimizations, and HybridEP diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md new file mode 100644 index 00000000000..40f56ec0c00 --- /dev/null +++ b/docs/api-backwards-compatibility-check.md @@ -0,0 +1,344 @@ +--- +orphan: true +--- + + + +# API Backward Compatibility Checking + +## Overview + +Megatron Core uses automated API compatibility checking to ensure stable interfaces between releases. This prevents accidental breaking changes that could affect users upgrading between versions. + +## How It Works + +The compatibility checker: +1. Compares the current code against the latest release +2. Detects breaking changes in function signatures +3. Fails CI if breaking changes are found (unless explicitly exempted) +4. 
Runs automatically on every PR that modifies `megatron/core` + +## What Gets Checked + +### ✅ Breaking Changes Detected + +- **Parameter removed** - Removing a function parameter +- **Parameter added without default** - Adding a required parameter +- **Parameter order changed** - Changing the order of parameters +- **Optional→Required** - Removing a default value from a parameter +- **Function removed** - Deleting a public function +- **Return type changed** - Changing the return type annotation (warning) + +### ⏭️ What Gets Skipped + +- **Test functions** - Functions starting with `test_` +- **Exempt decorators** - Functions marked with `@internal_api`, `@experimental_api`, or `@deprecated` +- **Excluded paths** - Code in `tests/`, `experimental/`, `legacy/` + +### ✅ Allowed Changes + +- **Adding optional parameters** - Adding parameters with default values +- **Adding new functions** - New public APIs +- **Making parameters optional** - Adding default values to required parameters + +## For Developers + +### Running Locally + +```bash +# Install griffe +pip install griffe + +# Check against latest release +python scripts/check_api_backwards_compatibility.py --baseline core_r0.8.0 + +# Check with verbose output +python scripts/check_api_backwards_compatibility.py --baseline core_r0.8.0 -v + +# Compare two specific branches +python scripts/check_api_backwards_compatibility.py --baseline core_r0.8.0 --current main +``` + +### Marking Functions as Exempt + +If you need to make breaking changes to internal or experimental APIs: + +#### Internal API (for internal implementation details) + +```python +from megatron.core.utils import internal_api + +@internal_api +def setup_internal_state(x, y): + """ + Internal implementation detail. + NOT FOR EXTERNAL USE; may change without notice. + """ + pass +``` + +**When to use `@internal_api`:** +- Internal APIs not documented for external use +- Internal helpers whose signatures may change between releases +- Functions in development that haven't been released yet + +#### Experimental API (for experimental features) + +```python +from megatron.core.utils import experimental_api + +@experimental_api +def new_experimental_feature(x, y): + """ + This API is experimental and may change without notice. + """ + pass +``` + +**When to use `@experimental_api`:** +- Experimental features explicitly marked as unstable +- New APIs under active development +- Features that haven't been stabilized yet + +### Deprecating APIs + +For planned API changes, use the deprecation workflow: + +```python +from megatron.core.backwards_compatibility_decorators import deprecated + +@deprecated( + version="1.0.0", # When deprecation starts + removal_version="2.0.0", # When it will be removed + alternative="new_function", # Recommended replacement + reason="Improved performance and cleaner API" +) +def old_function(x): + """This function is deprecated.""" + pass +``` + +**Deprecation Timeline:** +1. **Version N** - Add `@deprecated` decorator, function still works +2. **Version N+1** - Keep function with deprecation warnings +3. **Version N+2** - Remove function (users have been warned) + +### Handling CI Failures + +If the compatibility check fails on your PR: + +1. **Review the breaking changes** in the CI logs +2. **Choose an action:** + - **Fix the code** - Revert the breaking change + - **Add exemption** - Use `@internal_api` if intentional + - **Use deprecation** - For planned API changes +3. 
**Update your PR** with the fix + +## Examples + +### Example 1: Compatible Change + +```python +# ✅ BEFORE (v1.0) +def train_model(config, dataloader): + pass + +# ✅ AFTER (v1.1) - Added optional parameter +def train_model(config, dataloader, optimizer="adam"): + pass +``` +**Result:** ✅ Check passes + +--- + +### Example 2: Breaking Change + +```python +# BEFORE (v1.0) +def train_model(config, dataloader, optimizer="adam"): + pass + +# ❌ AFTER (v1.1) - Removed parameter +def train_model(config, dataloader): + pass +``` +**Result:** ❌ Check fails - "Parameter 'optimizer' removed" + +--- + +### Example 3: Exempt Internal API + +```python +from megatron.core.utils import internal_api + +# BEFORE (v1.0) +@internal_api +def _internal_compute(x, y): + pass + +# ✅ AFTER (v1.1) - Can change freely +@internal_api +def _internal_compute(x, y, z): # Added parameter + pass +``` +**Result:** ✅ Check passes (function is exempt) + +--- + +### Example 4: Deprecation Workflow + +```python +from megatron.core.backwards_compatibility_decorators import deprecated + +# Version 1.0 - Add deprecation +@deprecated( + version="1.0.0", + removal_version="2.0.0", + alternative="train_model_v2" +) +def train_model(config): + """Old training function - DEPRECATED""" + pass + +def train_model_v2(config, **options): + """New improved training function""" + pass + +# Version 1.1 - Keep both (users migrate) +# Version 2.0 - Remove train_model() +``` + +## Architecture + +``` +Developer commits code + ↓ +GitHub Actions triggers + ↓ +CI runs check_api_backwards_compatibility.py + ↓ +Script loads code via griffe: + • Baseline: latest release (e.g., core_r0.8.0) + • Current: PR branch + ↓ +Apply filtering: + • Skip @internal_api, @experimental_api, and @deprecated + • Skip private functions (_prefix) + • Skip test/experimental paths + ↓ +Griffe compares signatures: + • Parameters + • Types + • Return types + • Defaults + ↓ +Report breaking changes + ↓ +Exit: 0=pass, 1=fail + ↓ +CI fails if breaking changes detected +``` + +## Configuration + +### Customizing Filters + +Edit `scripts/check_api_backwards_compatibility.py`: + +```python +# Add more exempt decorators +EXEMPT_DECORATORS = [ + "internal_api", + "experimental_api", + "deprecated", +] + +# Add more path exclusions +EXCLUDE_PATHS = { + "tests", + "experimental", + "legacy", + "your_custom_path", # ← Add here +} +``` + +### Changing the Baseline + +The workflow auto-detects the latest `core_r*` tag. To manually specify: + +```yaml +# In .github/workflows/check_api_backwards_compatibility_workflow.yml +- name: Run compatibility check + run: | + python scripts/check_api_backwards_compatibility.py \ + --baseline your_custom_baseline +``` + +## FAQ + +### Q: Why did my PR fail the compatibility check? + +**A:** Your code introduced breaking changes compared to the last release. Review the CI logs to see what changed. + +### Q: Can I disable the check for my PR? + +**A:** No, but you can mark specific functions as exempt using `@internal_api` or `@experimental_api`. + +### Q: What if I need to make a breaking change? + +**A:** Use the `@deprecated` decorator for a gradual transition, or mark the function as exempt using `@internal_api` (for internal code) or `@experimental_api` (for experimental features). + +### Q: Does this check all of Megatron-LM? + +**A:** No, only `megatron/core/**` (Megatron Core). Legacy code is excluded. + +### Q: What about class methods? + +**A:** Yes, class methods are checked just like functions. + +### Q: Can I run this locally before pushing? + +**A:** Yes! 
Run `python scripts/check_api_backwards_compatibility.py --baseline core_r0.8.0` + +### Q: What if there's no release tag yet? + +**A:** The workflow will use `main` as the baseline. Update it once you have release tags. + +## Troubleshooting + +### Error: "griffe is not installed" + +```bash +pip install griffe +``` + +### Error: "No core_r* tags found" + +The repository doesn't have release tags yet. The workflow will fall back to `main`. + +### False Positives + +If the checker reports a breaking change that isn't actually breaking, file an issue and use `@internal_api` as a temporary workaround. + +## References + +- **Script:** `scripts/check_api_backwards_compatibility.py` +- **Workflow:** `.github/workflows/check_api_backwards_compatibility_workflow.yml` +- **Decorators:** `megatron/core/backwards_compatibility_decorators.py` +- **Griffe Documentation:** https://mkdocstrings.github.io/griffe/ + +## Support + +For questions or issues: +1. Check this documentation +2. Review existing PRs with compatibility checks +3. Ask in the Megatron-LM Slack/Discord +4. File an issue on GitHub + diff --git a/docs/api-guide/core/datasets.md b/docs/api-guide/core/datasets.md new file mode 100644 index 00000000000..d80c0183375 --- /dev/null +++ b/docs/api-guide/core/datasets.md @@ -0,0 +1,13 @@ + + +# datasets package + +```{include} ../../../megatron/core/datasets/readme.md +``` diff --git a/docs/api-guide/core/dist_checkpointing.md b/docs/api-guide/core/dist_checkpointing.md new file mode 100644 index 00000000000..c3dfc7aa257 --- /dev/null +++ b/docs/api-guide/core/dist_checkpointing.md @@ -0,0 +1,175 @@ + + +# dist_checkpointing package + +A library for saving and loading distributed checkpoints. +A *distributed checkpoint* in Megatron Core uses the ``torch_dist`` format, +a custom checkpointing mechanism built on top of PyTorch's native +checkpointing capabilities. + +A key property of distributed checkpoints is that a checkpoint saved under one +parallel configuration (tensor, pipeline, or data parallelism) can be loaded +under a different parallel configuration. This enables flexible scaling and +resharding of models across heterogeneous training setups. + +Using the library requires defining sharded state_dict dictionaries with functions from the *mapping* and *optimizer* modules. +Those state dicts can be saved or loaded with the *serialization* module using strategies from the *strategies* module. + +## Safe Checkpoint Loading + +Since **PyTorch 2.6**, the default behavior of `torch.load` is `weights_only=True`. +This ensures that only tensors and allow-listed classes are loaded, reducing the risk of arbitrary code execution. + +If you encounter an error such as: + +```bash +WeightsUnpickler error: Unsupported global: GLOBAL argparse.Namespace was not an allowed global by default. +``` + +you can fix it by explicitly allow-listing the missing class in your script: + +```python +import torch, argparse + +torch.serialization.add_safe_globals([argparse.Namespace]) +``` + +## Checkpointing Distributed Optimizer + +### Checkpoint Compatibility and Optimizer State Formats + +Beginning with **mcore v0.14**, the ``flattened_range`` attribute was removed from ``dist_checkpointing``. As a result: + +- Optimizer states saved with mcore versions <= 0.14 can no longer be loaded directly. Loading these legacy optimizer states is not supported because the required sharded metadata is no longer available. If you need to continue training from older checkpoints, refer to the workaround described below. 
+- Model weights from older checkpoints remain fully compatible: weights from checkpoints created by earlier versions load automatically, with no conversion required. To skip the incompatible optimizer state, add the ``--no-load-optim`` flag. + +### Workaround: Loading legacy optimizer states with ToT MCore + +**Step 1: Convert the legacy checkpoint using mcore v0.15.0** + +Run a dummy training job with mcore v0.15.0 to re-save the checkpoint in the new optimizer state format. + +```bash +MODEL_TRAIN_PARAMS=( + # Define model architecture and training parameters here +) +OLD_CKPT=/workspace/mcore_ckpt_old +CONVERTED_CKPT=/workspace/mcore_ckpt_0.15.0 + +torchrun --nproc_per_node=8 /opt/megatron-lm/pretrain_gpt.py \ + --save-interval 1 \ + --eval-interval 1 \ + --exit-interval 1 \ + --eval-iters 1 \ + --use-distributed-optimizer \ + --save ${CONVERTED_CKPT} \ + --load ${OLD_CKPT} \ + --ckpt-format torch_dist \ + "${MODEL_TRAIN_PARAMS[@]}" +``` + +**Step 2: Load the converted checkpoint with ToT MCore** + +Use the converted checkpoint as the input for continued training with ToT MCore. + +```bash +MODEL_TRAIN_PARAMS=( + # Define model architecture and training parameters here +) +NEW_CKPT=/workspace/mcore_ckpt_new +CONVERTED_CKPT=/workspace/mcore_ckpt_0.15.0 + +torchrun --nproc_per_node=8 /opt/megatron-lm/pretrain_gpt.py \ + --use-distributed-optimizer \ + --save ${NEW_CKPT} \ + --load ${CONVERTED_CKPT} \ + --ckpt-format torch_dist \ + "${MODEL_TRAIN_PARAMS[@]}" +``` + +After this step, training can proceed normally using ToT MCore with fully supported optimizer state loading. + +## Distributed Optimizer Checkpoint Formats + +The refactor of the Distributed Optimizer introduces **two checkpoint formats**: + +- dp_reshardable (Default) + - Fast save/load performance. + - Not reshardable; model parallelism cannot be changed when using this format. + - Recommended for general training when model parallelism changes are not needed. +- fully_reshardable + - Fully reshardable; supports arbitrary changes in model parallelism. + - Slower than dp_reshardable. + - Enabled via the ``--dist-ckpt-optim-fully-reshardable`` flag. + +### Workflow for Changing Model Parallelism + +You can combine formats to optimize both flexibility and performance: + + 1. Train using ``dp_reshardable`` (default) for faster checkpointing. + 2. When you need to change model parallelism: + + - Stop training. + - Change the model parallelism settings in the training config. + - Resume training with ``--dist-ckpt-optim-fully-reshardable``. + + 3. Save at least one checkpoint under the new model parallel configuration. + 4. (Optional) To continue training with the updated model parallelism and faster checkpointing, stop training and switch back to the ``dp_reshardable`` format by removing ``--dist-ckpt-optim-fully-reshardable``. + +## Async Checkpoint Saving Strategy + +The framework supports asynchronous checkpoint saving to improve training performance by offloading I/O operations. + +We are transitioning to a new async saving implementation based on the **NVRx (NVIDIA Resiliency Extension)** package. The legacy async strategy (referred to as **mcore**) is being deprecated. + +### Migration to NVRx + +- The **NVRx-based async strategy** will become the **default** in mcore v0.17. +- The existing **mcore async strategy** is **deprecated** and will be removed in future versions. +- A deprecation warning is emitted when using the mcore strategy. + +### Selecting Async Strategy + +The `--async-strategy` flag controls which async strategy is used. 
To use the legacy (**mcore**) strategy, set: + +```bash +--async-strategy mcore +``` + +### NVRx Dependency + +To use the NVRx-based async strategy, you must install the `nvidia-resiliency-ext` package. + +```bash +git clone https://github.com/NVIDIA/nvidia-resiliency-ext +cd nvidia-resiliency-ext +pip install . +``` + +> **Note:** +> - If `nvidia-resiliency-ext` is not installed, the NVRx async strategy will not be available. +> - The `mcore` strategy will remain temporarily to ensure a smooth transition but will be removed in future releases. +> - It is strongly recommended to migrate to the NVRx strategy as soon as possible. + +### Async Saving for `fsdp_dtensor` and `torch_dcp` checkpoints + +Starting from mcore v0.17, asynchronous checkpoint saving is supported for `fsdp_dtensor` and `torch_dcp` formats. + +Note that async saving for these formats requires the `nvidia-resiliency-ext` package. As a result, the only supported `async_strategy` in this context is `nvrx`. + +## Subpackages + +```{toctree} :maxdepth: 4 dist_checkpointing.strategies ``` + diff --git a/docs/api-guide/core/dist_checkpointing.strategies.md b/docs/api-guide/core/dist_checkpointing.strategies.md new file mode 100644 index 00000000000..22fe3517a54 --- /dev/null +++ b/docs/api-guide/core/dist_checkpointing.strategies.md @@ -0,0 +1,16 @@ + + +# dist_checkpointing.strategies package + +Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies). + +Strategies can be used to implement new checkpoint formats, or new (more optimal for a given use case) ways of saving/loading existing formats. +Strategies are passed to the `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure. + diff --git a/docs/api-guide/core/distributed.md b/docs/api-guide/core/distributed.md new file mode 100644 index 00000000000..13da4285ec5 --- /dev/null +++ b/docs/api-guide/core/distributed.md @@ -0,0 +1,19 @@ + + +# distributed package + +This package contains various utilities to finalize model weight gradients +on each rank before the optimizer step. This includes a distributed data +parallelism wrapper to all-reduce or reduce-scatter the gradients across +data-parallel replicas, and a `finalize_model_grads` method to +synchronize gradients across different parallelism modes (e.g., 'tied' +layers on different pipeline stages, or gradients for experts in a MoE on +different ranks due to expert parallelism). + diff --git a/docs/api-guide/core/fusions.md b/docs/api-guide/core/fusions.md new file mode 100644 index 00000000000..fdd358e813c --- /dev/null +++ b/docs/api-guide/core/fusions.md @@ -0,0 +1,20 @@ + + +# fusions package + +This package provides modules for commonly fused +operations. Fusing operations improves compute efficiency by +increasing the amount of work done each time a tensor is read from +memory. To perform the fusion, modules in this package either rely on PyTorch +functionality for just-in-time compilation +(i.e., `torch.jit.script` in older PyTorch versions or `torch.compile` +in recent versions), or call into custom kernels in external libraries +such as Apex or TransformerEngine. + diff --git a/docs/api-guide/core/index.md b/docs/api-guide/core/index.md new file mode 100644 index 00000000000..0d39e46e744 --- /dev/null +++ b/docs/api-guide/core/index.md @@ -0,0 +1,25 @@ + + +# Core APIs + +Low-level API reference for core Megatron components. 
+ +```{toctree} :maxdepth: 2 transformer tensor_parallel pipeline_parallel fusions distributed datasets dist_checkpointing dist_checkpointing.strategies ``` diff --git a/docs/api-guide/core/pipeline_parallel.md b/docs/api-guide/core/pipeline_parallel.md new file mode 100644 index 00000000000..35f3c5b5cc2 --- /dev/null +++ b/docs/api-guide/core/pipeline_parallel.md @@ -0,0 +1,16 @@ + + +# pipeline_parallel package + +This package contains implementations for two different pipeline parallelism +schedules (one without interleaving and one with interleaving, see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) +for details), and a default no-pipelining schedule. It also contains methods +for the point-to-point communication that is needed between pipeline stages. + diff --git a/docs/api-guide/core/tensor_parallel.md b/docs/api-guide/core/tensor_parallel.md new file mode 100644 index 00000000000..2d41c5f4467 --- /dev/null +++ b/docs/api-guide/core/tensor_parallel.md @@ -0,0 +1,17 @@ + + +# tensor_parallel package + +This package contains an implementation for tensor parallelism in transformer +models (see [Megatron-LM: Training Multi-Billion Parameter Language Models +Using Model Parallelism](https://arxiv.org/abs/1909.08053) and [Reducing +Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198) +for details). + diff --git a/docs/api-guide/core/transformer.md b/docs/api-guide/core/transformer.md new file mode 100644 index 00000000000..03bc0f501f4 --- /dev/null +++ b/docs/api-guide/core/transformer.md @@ -0,0 +1,19 @@ + + +# transformer package + +The `transformer` package provides a customizable and configurable +implementation of the transformer model architecture. Each component +of a transformer stack, from entire layers down to individual linear +layers, can be customized by swapping in different PyTorch modules +using the "spec" parameters. The +configuration of the transformer (hidden size, number of layers, +number of attention heads) is provided using a `TransformerConfig` +object. diff --git a/docs/api-guide/index.md b/docs/api-guide/index.md new file mode 100644 index 00000000000..7afa2450dd0 --- /dev/null +++ b/docs/api-guide/index.md @@ -0,0 +1,20 @@ + + +# API Guide + +API reference documentation for Megatron Core components. + +```{toctree} :maxdepth: 3 models/index core/index internal/index ``` diff --git a/docs/api-guide/internal/index.md b/docs/api-guide/internal/index.md new file mode 100644 index 00000000000..312081ce70b --- /dev/null +++ b/docs/api-guide/internal/index.md @@ -0,0 +1,19 @@ + + +# Internal Utilities + +Internal utility APIs. + +```{toctree} :maxdepth: 2 num_microbatches_calculator optimizer_param_scheduler ``` diff --git a/docs/api-guide/internal/num_microbatches_calculator.md b/docs/api-guide/internal/num_microbatches_calculator.md new file mode 100644 index 00000000000..0c223588ce5 --- /dev/null +++ b/docs/api-guide/internal/num_microbatches_calculator.md @@ -0,0 +1,13 @@ + + +# Microbatches Calculator + +This API calculates the number of microbatches to run per training step from the global batch size, micro-batch size, and data-parallel size, with optional batch-size ramp-up. 
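In steady state, the value it returns is simply `global_batch_size / (micro_batch_size * data_parallel_size)`. A minimal sketch of the corresponding training invocation (the batch-size flags are standard Megatron-LM arguments; the remaining model and data flags are elided into `MODEL_TRAIN_PARAMS`, following the convention used elsewhere in these docs):

```bash
# 8 GPUs with no model parallelism gives a data-parallel size of 8, so each
# optimizer step accumulates 256 / (4 * 8) = 8 microbatches per rank.
torchrun --nproc_per_node=8 pretrain_gpt.py \
    --micro-batch-size 4 \
    --global-batch-size 256 \
    "${MODEL_TRAIN_PARAMS[@]}"
```

With `--rampup-batch-size`, the calculator instead grows the global batch size over the course of training and recomputes the number of microbatches as it goes.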
+ diff --git a/docs/api-guide/internal/optimizer_param_scheduler.md b/docs/api-guide/internal/optimizer_param_scheduler.md new file mode 100644 index 00000000000..45e5e4b7da1 --- /dev/null +++ b/docs/api-guide/internal/optimizer_param_scheduler.md @@ -0,0 +1,13 @@ + + +# Optimizer Parameters Scheduler + +This API calculates the learning rate and weight decay schedules for the optimizer. + diff --git a/docs/api-guide/models/index.md b/docs/api-guide/models/index.md new file mode 100644 index 00000000000..e5bb531454b --- /dev/null +++ b/docs/api-guide/models/index.md @@ -0,0 +1,21 @@ + + +# Model APIs + +API reference for Megatron Core model implementations. + +```{toctree} :maxdepth: 2 models models.gpt models.bert models.t5 ``` diff --git a/docs/api-guide/models/models.bert.md b/docs/api-guide/models/models.bert.md new file mode 100644 index 00000000000..1543f4df865 --- /dev/null +++ b/docs/api-guide/models/models.bert.md @@ -0,0 +1,13 @@ + + +# models.bert package + +A package for training BERT and BERT-like encoder-only models. It optionally includes a binary head that can be used for classification tasks. + diff --git a/docs/api-guide/models/models.gpt.md b/docs/api-guide/models/models.gpt.md new file mode 100644 index 00000000000..1c3cbb5484c --- /dev/null +++ b/docs/api-guide/models/models.gpt.md @@ -0,0 +1,13 @@ + + +# models.gpt package + +This is the implementation of the popular GPT model. It supports several features, including model parallelism (tensor, pipeline, and data parallel), mixture of experts, FP8, and the distributed optimizer. We are constantly adding new features, so be on the lookout, or raise an issue if you want something added. + diff --git a/docs/api-guide/models/models.md b/docs/api-guide/models/models.md new file mode 100644 index 00000000000..a633546f0c9 --- /dev/null +++ b/docs/api-guide/models/models.md @@ -0,0 +1,23 @@ + + +# models package + +This package contains most of the popular LLMs. We currently support GPT, BERT, and T5; this is an ever-growing list, so keep an eye out. + +## Subpackages + +```{toctree} :maxdepth: 4 models.gpt models.t5 models.bert ``` + diff --git a/docs/api-guide/models/models.t5.md b/docs/api-guide/models/models.t5.md new file mode 100644 index 00000000000..4694b80113a --- /dev/null +++ b/docs/api-guide/models/models.t5.md @@ -0,0 +1,11 @@ + + +# models.t5 package + diff --git a/docs/api-guide/router_replay.md b/docs/api-guide/router_replay.md new file mode 100644 index 00000000000..88476ea7537 --- /dev/null +++ b/docs/api-guide/router_replay.md @@ -0,0 +1,186 @@ + + +# Design Document: MoE Router Replay Feature + +## 1. Overview + +This document provides a detailed description of the "Router Replay" feature implemented within Megatron Core for Mixture-of-Experts (MoE) models. + +This feature is designed to enhance determinism and analyzability in MoE model training and inference. It enables the model to load routing decisions from a predefined file and enforce their use during the forward pass, thereby bypassing the real-time routing computation. + +## 2. Motivation + +* **Determinism & Reproducibility**: In distributed training, MoE routing decisions can exhibit minor variations due to factors like floating-point precision. By replaying a fixed routing table, the MoE computation path is guaranteed to be identical across runs, which facilitates debugging and reproducing experimental results. 
+* **Performance Profiling**: The router's own computation (e.g., logits calculation, top-k selection) incurs overhead. In replay mode, this part of the computation can be completely skipped, allowing for more precise isolation and profiling of performance bottlenecks within the Expert Layers themselves. +* **Debugging Aid**: When issues arise in the model, fixing the routing decisions helps to isolate variables, making it easier to determine whether the problem lies with the routing mechanism or the expert computations. + +## 3. Design and Architecture + +The design follows the principles of being non-intrusive and on-demand, with the core idea of activating the replay logic only when explicitly requested by the user. + +* **Core Components**: + * `RouterReplay` (located in `megatron/core/transformer/moe/router_replay.py`): A utility class for replaying MoE routing decisions. When enabled via the `moe_enable_routing_replay` flag, a separate instance of `RouterReplay` is created for each MoE layer's router. Each instance is responsible for loading routing data and providing the deterministic routing decisions for its corresponding layer during the forward pass. + * `moe_enable_routing_replay` (located in `megatron/core/transformer/transformer_config.py`): A boolean global configuration flag that serves as the sole entry point for enabling this feature. + +* **Workflow**: + The feature supports different modes, such as recording and replaying, controlled by a `RouterReplayAction`. + + 1. **Enabling the Feature**: The user sets `moe_enable_routing_replay` to `True` in the model configuration. + 2. **Initialization**: When `moe_enable_routing_replay` is true, each `TopKRouter` creates its own `RouterReplay` instance. + 3. **Mode Configuration**: The user must programmatically set the desired router replay action (e.g., `record`, `forward_replay`, `backward_replay`) on the `RouterReplay` instances. + 4. **Execution Flow (within a mini-batch)**: + * **Forward Pass**: + * For each micro-batch, the `topk_routing_with_score_function` checks the `router_replay_action`. + * **In `record` mode**: The dynamically computed `top-k` expert indices are captured and stored. + * **In `forward_replay` mode**: The function retrieves pre-loaded expert indices from `target_topk_idx`. These indices are used for the forward computation and are also appended to the `replay_backward_list` to prepare for the backward pass. + * **Backward Pass**: + * For each micro-batch (processed in reverse order in pipeline parallelism), the `router_replay_action` is checked again. + * **In `backward_replay` mode**: The function retrieves the expert indices for the corresponding micro-batch by popping them from the `replay_backward_list`. This mode is intended for training recomputation (e.g., activation checkpointing and pipeline recompute) so the same routing decisions are used during recompute/backward as in forward, ensuring determinism and correctness. + +## 4. Implementation Details + +The implementation cleanly separates the replay logic from the router's core computation. + +* **`megatron/core/transformer/transformer_config.py`**: + * Adds the configuration option `moe_enable_routing_replay: bool = False`. + +* **`megatron/core/transformer/moe/moe_utils.py`**: + * Introduces the `RouterReplay` class to manage the state for recording and replaying routing decisions for a single MoE layer. + * `target_topk_idx`: An attribute holding the expert indices for the current micro-batch during forward replay mode. 
+ * `recorded_topk_idx`: An attribute for storing the computed expert indices when in record mode. + * `replay_backward_list`: A list that accumulates the `top-k` indices used during the forward passes of a mini-batch. This list is consumed in FIFO order during the backward pass to ensure correctness under pipeline parallelism. + * `set_target_indices()`: A method to load the replay indices into `target_topk_idx` for the forward pass. + * `record_indices()`: A method to save the computed indices. + * The `topk_routing_with_score_function` is modified to contain the core logic. It checks the `router_replay_action` on the `router_replay` instance and accordingly performs one of the following actions: computes and records indices, replays indices from `target_topk_idx` (for forward), replays indices from `replay_backward_list` (for backward), or falls through to the default dynamic routing. + +### Training recompute usage + +- During forward replay, `set_target_indices()` prepares `replay_backward_list` so each micro-batch’s indices are available for recomputation. +- During recompute/backward, set action to `REPLAY_BACKWARD` so indices are consumed in FIFO order to mirror the forward sequence. + +## 5. Usage Guide + +1. **Enable & Instantiate** + - Create one `RouterReplay` instance per MoE router layer when building the model. + - Optionally use the global helpers to set/clear actions across all layers. +2. **Record Routing Decisions** + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)`. + - Run the model; retrieve per-layer indices via `RouterReplay.get_recorded_data()` and persist. +3. **Forward Replay** + - Load indices and distribute: `RouterReplay.set_replay_data(list_of_tensors)`. + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)`. + - Run the model; dynamic top‑k is bypassed and target indices are used. +4. **Backward Replay** + - For training recomputation (activation checkpointing or pipeline recompute), set action: `REPLAY_BACKWARD` during recomputation. + - Per micro‑batch indices are consumed from `replay_backward_list` in FIFO order. +5. **Cleanup** + - Use `RouterReplay.clear_global_indices()`, `RouterReplay.clear_global_router_replay_action()`, and `RouterReplay.clear_global_router_replay_instances()` to restore default behavior and prevent memory leaks. + +### Quick usage with `topk_routing_with_score_function` + +```python +import torch +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction +from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function + +rr = RouterReplay() + +# Record +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) +logits = torch.randn(8, 16) +probs_rec, routing_map_rec = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) +recorded = rr.get_recorded_indices() +torch.save(recorded, "/tmp/replay.pt") + +# Forward replay +rr.clear_router_replay_action() +rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD) +target = torch.load("/tmp/replay.pt") +rr.set_target_indices(target) +probs_rep, routing_map_rep = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) + +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +``` + +## 6. 
Minimal Demo + +Here is a minimal code example showing how to use RouterReplay for recording and replaying: + +```python +import torch +import torch.distributed as dist +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.moe.router import TopKRouter +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction + + +# Initialize distributed training +if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + +# Create a transformer config with RouterReplay enabled +config = TransformerConfig( + num_experts=8, + expert_model_parallel_size=1, + num_top_k=2, + moe_enable_routing_replay=True +) + +# Create a TopKRouter instance +router = TopKRouter(config) + +# Generate sample input (batch_size, sequence_length, hidden_size) +logits = torch.randn(16, 32, 8).to(torch.cuda.current_device()) + +# ----------------- +# 1. Recording Mode +# ----------------- +print("=== Recording Mode ===") +# Set global router replay action to RECORD +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) + +# Perform routing +routing_output = router.forward(logits) +print(f"Recorded top-k indices shape: {routing_output.top_k_idx.shape}") + +# ----------------- +# 2. Forward Replay Mode +# ----------------- +print("\n=== Forward Replay Mode ===") +# Save recorded indices to a file +torch.save(routing_output.top_k_idx, "/tmp/replay.pt") + +# Load indices from file and set as target for replay +replay_indices = torch.load("/tmp/replay.pt") +for router_instance in RouterReplay.global_router_replay_instances: + router_instance.target_topk_idx = replay_indices + +# Set global router replay action to REPLAY_FORWARD +RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + +# Perform routing again - this will use the replayed indices +replay_routing_output = router.forward(logits) +print(f"Replayed top-k indices shape: {replay_routing_output.top_k_idx.shape}") +print(f"Are indices the same? {torch.equal(routing_output.top_k_idx, replay_routing_output.top_k_idx)}") + + +# Clean up +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +if dist.is_initialized(): + dist.destroy_process_group() +``` diff --git a/docs/autodoc2_docstrings_parser.py b/docs/autodoc2_docstrings_parser.py new file mode 100644 index 00000000000..14b722de65b --- /dev/null +++ b/docs/autodoc2_docstrings_parser.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from docutils import nodes +from myst_parser.parsers.sphinx_ import MystParser +from sphinx.ext.napoleon.docstring import GoogleDocstring + + +class NapoleonParser(MystParser): + """Add support for Google style docstrings.""" + + def parse(self, input_string: str, document: nodes.document) -> None: + """Parse Google style docstrings.""" + + # Get the Sphinx configuration + config = document.settings.env.config + + # Process with Google style + google_parsed = str(GoogleDocstring(input_string, config)) + + return super().parse(google_parsed, document) + + +Parser = NapoleonParser diff --git a/docs/broken_links_false_positives.json b/docs/broken_links_false_positives.json new file mode 100644 index 00000000000..01377be5804 --- /dev/null +++ b/docs/broken_links_false_positives.json @@ -0,0 +1,3 @@ +{ + "uri": "http://localhost:8080/" +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000000..c18f453490d --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,133 @@ +# Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import sys + + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "Megatron Core" +copyright = "2026, NVIDIA Corporation" +author = "NVIDIA Corporation" +release = "nightly" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "myst_parser", # For our markdown docs + "sphinx.ext.viewcode", # For adding a link to view source code in docs + "sphinx.ext.doctest", # Allows testing in docstrings + "sphinx.ext.napoleon", # For google style docstrings + "sphinx_copybutton", # For copy button in code blocks +] + +# Check if we should skip autodoc generation +# usage: SKIP_AUTODOC=true +skip_autodoc = os.environ.get("SKIP_AUTODOC", "false").lower() == "true" + +if not skip_autodoc: + extensions.append("autodoc2") # Generates API docs + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- Options for MyST Parser (Markdown) -------------------------------------- +# MyST Parser settings +myst_enable_extensions = [ + "dollarmath", # Enables dollar math for inline math + "amsmath", # Enables LaTeX math for display mode + "colon_fence", # Enables code blocks using ::: delimiters instead of ``` + "deflist", # Supports definition lists with term: definition format + "fieldlist", # Enables field lists for metadata like :author: Name + "tasklist", # Adds support for GitHub-style task lists with [ ] and [x] + "attrs_block", # Enables setting attributes on block elements using 
{#id .class key=val} +] +myst_heading_anchors = 5 # Generates anchor links for headings up to level 5 + +# Suppress "more than one target found for cross-reference" warnings for Python symbols +# that have the same name across multiple modules (e.g. DistributedDataParallelConfig, +# ModelType). These are structural ambiguities in the codebase – the cross-reference +# still resolves; Sphinx just cannot pick the unique target automatically. +suppress_warnings = ["ref.python"] + +# -- Options for Autodoc2 --------------------------------------------------- +sys.path.insert(0, os.path.abspath("..")) + +if not skip_autodoc: + autodoc2_packages = [ + { + "path": "../megatron/core", # Path to your package relative to conf.py + "exclude_dirs": ["converters"], # list of directory names to exclude + } + ] + autodoc2_render_plugin = "myst" # Use MyST for rendering docstrings + autodoc2_output_dir = "apidocs" # Output directory for autodoc2 (relative to docs/) + # This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to + # render google style docstrings. + # Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33 + autodoc2_docstring_parser_regexes = [ + (r".*", "docs.autodoc2_docstrings_parser"), + ] + # Regex patterns whose values contain raw regex syntax (e.g. \p{L}) that docutils + # mis-parses as footnote/reference markup. Exclude them from the generated docs. + autodoc2_hidden_regexes = [ + r".*\._PATTERN_TIKTOKEN.*", + ] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "nvidia_sphinx_theme" +html_theme_options = { + "switcher": { + "json_url": "../versions1.json", + "version_match": release, + }, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/NVIDIA/Megatron-LM/", + "icon": "fa-brands fa-github", + } + ], + "public_docs_features": os.environ.get("SKIP_PUBLIC_DOCS_FEATURES", "false").lower() != "true", +} +html_extra_path = ["project.json", "versions1.json"] + +# GitHub links are rate-limited when checked from GitHub Actions +linkcheck_ignore = [ + ".*github\\.com.*", + ".*githubusercontent\\.com.*", + "http://localhost.*", +] + +# PyTorch docs use a JS-rendered frontend: anchor IDs are injected at runtime, +# are not present in the static HTML that linkcheck fetches, and change between +# stable versions. Verify that the page loads, but skip anchor validation to +# avoid spurious failures on redirects. +linkcheck_anchors_ignore_for_url = [ + r"https://docs\.pytorch\.org/.*", +] diff --git a/docs/developer/contribute.md b/docs/developer/contribute.md new file mode 100644 index 00000000000..30a39e1cbc0 --- /dev/null +++ b/docs/developer/contribute.md @@ -0,0 +1,70 @@ + + +# Contributing to Megatron-LM + +This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM GitHub repository. + +Everyone is welcome to contribute to the project! We recently migrated from using an internal repo to doing all development directly from the GitHub repository. + +When contributing, it is important to ensure that changes are in line with the project direction. Small changes to fix bugs are welcomed and appreciated. 
**If proposing large architectural changes or changes for stylistic reasons, open an issue first so we can discuss it.** + +## Issue policy + +Please do file any bugs you find, keeping the following in mind: + +- If filing a bug, i.e. you have found something that doesn't work as expected, use the BUG template. +- If you've found a regression in speed or accuracy, use the REGRESSION template. +- If you are requesting a new feature or modification of an existing feature, use the ENHANCEMENT template. +- If opening an issue to ask a question, no template is needed, but please make your question as clear and concise as possible. +- One issue per bug. Putting multiple things in the same issue makes both discussion and completion unnecessarily complicated. +- Your bug is most likely to get attention from the development team quickly if we can easily reproduce it. +- Use proper spelling, grammar, and punctuation. +- Write in an authoritative and technical tone. + +## Code submission policy + +### Do + +- Format new code in a style that is consistent with the file being changed. Megatron-LM doesn't (yet) have a style guide or enforced formatting. +- Split your changes into separate, atomic commits, i.e., a commit per feature or fix. +- Make sure your commits are rebased on the master branch. +- Write the commit message subject line in the imperative mood ("Change the default argument for X", not "Changed the default argument for X"). +- Write your commit messages in proper English, with care and punctuation. +- Check the spelling of your code, comments, and commit messages. + +### Don't + +- Submit code that's incompatible with the project license. +- Touch anything outside the stated scope of the PR. This includes formatting changes to code not relevant to the PR. +- Iterate excessively on your design across multiple commits. +- Include commented-out code. +- Attempt large architectural changes without first opening an issue to discuss. + +## Issue and Pull Request Q&A + +### I've submitted an issue and PR. When can I expect to get some feedback? + +You should receive a response within 2 business days. + +### I need help, who should I ping? + +Use [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall). + +### If my issue or PR isn't getting attention, what should I do? + +After 2 business days, tag the user [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall). + +### Is there a policy for issues and PRs that haven't been touched in X days? Should they be closed? + +Yes, we have a bot that will mark untouched PRs as "stale" after 60 days. + +We have a long backlog of issues and PRs dating back years. We are trying to triage these now by working backwards. Older issues we believe may still be relevant may receive a request to re-test them with the latest code. If there's no response, they may be closed. Again, if you believe they should be re-opened, just respond with a comment to that effect. + +Thank you! \ No newline at end of file diff --git a/docs/developer/generate_docs.md b/docs/developer/generate_docs.md new file mode 100644 index 00000000000..810c630681e --- /dev/null +++ b/docs/developer/generate_docs.md @@ -0,0 +1,22 @@ + + +# Generating Docs Locally + +To generate docs locally, use the following commands: + +```sh +cd docs +SKIP_PUBLIC_DOCS_FEATURES=true uv run --only-group docs sphinx-autobuild . _build/html --port 8080 --host 127.0.0.1 +``` + +Docs will be generated at http://127.0.0.1:8080. 
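For quicker rebuilds while editing prose, the API-reference generation can be skipped by combining the command above with the variable described in the note below (a sketch using only the flags documented on this page):

```sh
cd docs
SKIP_AUTODOC=true SKIP_PUBLIC_DOCS_FEATURES=true uv run --only-group docs \
    sphinx-autobuild . _build/html --port 8080 --host 127.0.0.1
```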
+ +**Recommended:** set the environment variable `SKIP_AUTODOC=true` when generating docs +to skip the generation of `apidocs`. \ No newline at end of file diff --git a/docs/developer/oncall.md b/docs/developer/oncall.md new file mode 100644 index 00000000000..18d76f1436a --- /dev/null +++ b/docs/developer/oncall.md @@ -0,0 +1,59 @@ + + +# Oncall Overview + +During your oncall week, you will be assigned to all PRs marked “Ready for +Review”. From a high level, your responsibilities include: + +- Review all new PRs +- Accelerate the review process +- Ensure issues and discussion questions are answered + +## PR Responsibilities + +Below is the checklist that the oncall needs to go through for each PR. + +- Should the PR remain a single PR? + - Each PR should have at most 1 expert reviewer, although there will be some outlier cases +- Label PR as “complexity: low”, “complexity: medium”, or “complexity: high” depending on complexity + - Expert reviewers have the final say; the oncall just sets the initial complexity level + - Initial complexity level guideline + - Low: <100 lines changed + - Medium: 100 < lines changed < 500 + - High: > 500 lines changed +- Does this PR have proper testing coverage? + - If new logic is added, is the new logic tested? +- Should the PR add documentation for any new features? +- Does the PR conform to our style guidelines? + - Code structure + - Cleanliness + - Comments + - File structure +- Do all tests pass? + - The oncall will need to kick off the testing suite for external contributors + - Comment “/ok to test commit_id” to kick off the testing suite +- Expert reviewers are notified after the PR is marked “Ready for Review” + - **Expert reviewers should review within 1 business day.** Message the assigned reviewer if it is taking longer. The reviewer either needs to review the PR or suggest an alternate reviewer. + - If the reviewer is not responding after 2 business days, escalate to the reviewer’s manager. +- For `megatron/core` PRs, the “Final Review” label is applied automatically once all expert reviewers approve + - Final reviewers should review within 1 business day. Message the assigned reviewer if it is taking longer. + - If the reviewer is not responding after 2 business days, escalate to the reviewer’s manager. +- The “Approved” label is applied automatically once all required reviewers have approved + +## Issues and Discussion Questions + +If you do not know the answer to an issue or discussion question, that's OK; **delegate to someone who does.** + +On a daily basis, track the following: + +- [Dashboard for out-of-SLA issues](https://github.com/NVIDIA/Megatron-LM/issues?q=is%3Aissue%20state%3Aopen%20label%3Awaiting-on-maintainers). + + diff --git a/docs/developer/submit.md b/docs/developer/submit.md new file mode 100644 index 00000000000..205e18cc52f --- /dev/null +++ b/docs/developer/submit.md @@ -0,0 +1,34 @@ + + +# How to Submit a PR + +All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft. + +## Step 1: Mark PR as "Ready for Review" + +1. When your PR is ready, click **Ready for Review**. +2. The oncall reviewer is auto-assigned and expert reviewers are notified based on your changes; they will pick up your PR soon. + +:warning: Only mark as ready once all merge conflicts are resolved and the CI is passing. +Final Review may be declined if these requirements are not fulfilled. 
+ +## Step 2: Final Review (`megatron/core` only) + +For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned. + +For PRs outside `megatron/core`, this step is skipped. + +## Step 3: Approved + +Once all required reviewers have approved, the `Approved` label is applied **automatically**. The PR is now ready to merge. + +## Step 4: Merge + +Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR. diff --git a/docs/discussions/README.md b/docs/discussions/README.md new file mode 100644 index 00000000000..aab65fc65ca --- /dev/null +++ b/docs/discussions/README.md @@ -0,0 +1,31 @@ +--- +orphan: true +--- + + + +# Megatron Discussions + +This directory contains in-depth guides, tutorials, and discussions about optimizing and using Megatron for various use cases. + +## Available Guides + +### Training Guides + +## Contributing + +To contribute a guide or tutorial, follow this structure: + +1. Create a new directory: `docs/discussions/your-guide-name/` +2. Add your main guide: `docs/discussions/your-guide-name/your-guide-name.md` +3. Create an images directory: `docs/discussions/your-guide-name/images/` +4. Update this README.md with a link to your guide + +Each guide should be self-contained with its own images and supporting files. \ No newline at end of file diff --git a/docs/distrib_optimizer.md b/docs/distrib_optimizer.md deleted file mode 100644 index def23b20ebe..00000000000 --- a/docs/distrib_optimizer.md +++ /dev/null @@ -1,54 +0,0 @@ -# Distributed Optimizer - -The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following: - -- [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed) -- [no] distribute model gradients -- [no] distribute model parameters - -Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size): - -| | Non-distributed optim | Distributed optim | -| ------ | ------ | ------ | -| float16 param, float16 grads | 20 | 4 + 16/d | -| float16 param, fp32 grads | 18 | 6 + 12/d | -| fp32 param, fp32 grads | 16 | 8 + 8/d | - -The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. The grad buffer at any given moment either holds: - -1. all model grads -2. a 1/d size _copy_ of the main grads (before copying to the optimizer state) -3. a 1/d size _copy_ of the main params (after copying from the optimizer state) -4. all model params -5. zeros (or None), between iterations - -The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated. 
- -The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update: - -## Data flow - -![Data flow](images/distrib_optimizer/data_flow.png) - -## Sharding scheme - -![Sharding scheme](images/distrib_optimizer/sharding_scheme.png) - -## Key steps - -_(note: using illustrations above, and assuming fp16 grads)_ - -- Backward pass finishes (grad buffer holds 16 fp16 grad elements) -- Call reduce-scatter on each DP rank -- Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage) -- Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e. - - DP rank 0 copies elements [0:4] - - DP rank 1 copies elements [4:8] - - DP rank 2 copies elements [8:12] - - DP rank 3 copies elements [12:16] -- Optimizer.step() -- Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer -- Call all-gather on each DP rank -- Grad buffer now contains all 16, fully updated, fp16 model param elements -- Copy updated model params from grad buffer into their respective param tensors -- (At this point, grad buffer is ready to be zero'd for the next iteration) diff --git a/docs/documentation.md b/docs/documentation.md new file mode 100644 index 00000000000..d2554157b45 --- /dev/null +++ b/docs/documentation.md @@ -0,0 +1,69 @@ +--- +orphan: true +--- + + + +# Documentation Development + +- [Documentation Development](#documentation-development) + - [Build the Documentation](#build-the-documentation) + - [Checking for Broken Links](#checking-for-broken-links) + - [Live Building](#live-building) + - [Documentation Version](#documentation-version) + + +## Build the Documentation + +The following sections describe how to set up and build the Megatron-LM documentation. + +Switch to the documentation source folder and generate HTML output. + +```sh +cd docs/ +uv run --group docs sphinx-build . _build/html +``` + +* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder. +* The generated Python API docs are placed in `apidocs` under the `docs/` folder. + +## Checking for Broken Links + +To check for broken HTTP links in the docs, run this command: + +```sh +cd docs/ +uv run --group docs sphinx-build --builder linkcheck . _build/linkcheck +``` + +It will output a JSON file at `_build/linkcheck/output.json` with the links it found while building the +docs. Records will have a status of `broken` if the link is not reachable. The `docs/conf.py` file is +configured to ignore GitHub links because the CI test will often experience rate limit errors. +Comment out the `linkcheck_ignore` variable there to check all the links. + +## Live Building + +When writing documentation, it can be helpful to serve the documentation and have it update live while you edit. + +To do so, run: + +```sh +cd docs/ +uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0 +``` + +Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output. + +## Documentation Version + +The three files below control the version switcher. Before you attempt to publish a new version of the documentation, update these files to match the latest version numbers.
+ +* docs/versions1.json +* docs/project.json +* docs/conf.py diff --git a/docs/get-started/install.md b/docs/get-started/install.md new file mode 100644 index 00000000000..3e60a1fbb81 --- /dev/null +++ b/docs/get-started/install.md @@ -0,0 +1,123 @@ + + +# Installation + +## System Requirements + +### Hardware + +- **Recommended**: NVIDIA Turing architecture or later +- **FP8 Support**: Requires NVIDIA Hopper, Ada, or Blackwell GPUs + +### Software + +- **Python**: >= 3.10 (3.12 recommended) +- **PyTorch**: >= 2.6.0 +- **CUDA Toolkit**: Latest stable version + + +## Prerequisites + +Install [uv](https://docs.astral.sh/uv/), a fast Python package installer: + +```bash +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + + +## Option A: Pip Install (Recommended) + +Install the latest stable release from PyPI: + +```bash +uv pip install megatron-core +``` + +To include optional training dependencies (Weights & Biases, SentencePiece, HF Transformers): + +```bash +uv pip install "megatron-core[training]" +``` + +For all extras including [Transformer Engine](https://github.com/NVIDIA/TransformerEngine): + +```bash +uv pip install --group build +uv pip install --no-build-isolation "megatron-core[training,dev]" +``` + +```{note} +`--no-build-isolation` requires build dependencies to be pre-installed in the environment. `torch` is needed because several `[dev]` packages (`mamba-ssm`, `nv-grouped-gemm`, `transformer-engine`) import it at build time to compile CUDA kernels. Expect this step to take **20+ minutes** depending on your hardware. If you prefer pre-built binaries, the [NGC Container](#option-c-ngc-container) ships with these pre-compiled. +``` + +```{warning} +Building from source can consume a large amount of memory. By default the build runs one compiler job per CPU core, which can cause out-of-memory failures on machines with many cores. To limit parallel compilation jobs, set the `MAX_JOBS` environment variable before installing (for example, `MAX_JOBS=4`). +``` + +```{tip} +For a lighter set of development dependencies without Transformer Engine and ModelOpt, use `[lts]` instead of `[dev]`: `uv pip install --no-build-isolation "megatron-core[training,lts]"`. The `[lts]` and `[dev]` extras are mutually exclusive. +``` + +To clone the repository for examples: + +```bash +git clone https://github.com/NVIDIA/Megatron-LM.git +``` + + +## Option B: Install from Source + +For development or to run the latest unreleased code: + +```bash +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +uv pip install -e . +``` + +To install with all development dependencies (includes Transformer Engine, requires pre-installed build deps): + +```bash +uv pip install --group build +uv pip install --no-build-isolation -e ".[training,dev]" +``` + +```{tip} +If the build runs out of memory, limit parallel compilation jobs with `MAX_JOBS=4 uv pip install --no-build-isolation -e ".[training,dev]"`. +``` + + +## Option C: NGC Container + +For a pre-configured environment with all dependencies pre-installed (PyTorch, CUDA, cuDNN, NCCL, Transformer Engine), use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch). + +We recommend using the **previous month's** NGC container rather than the latest one to ensure compatibility with the current Megatron Core release and testing matrix. 
+ +```bash +docker run --gpus all -it --rm \ + -v /path/to/dataset:/workspace/dataset \ + -v /path/to/checkpoints:/workspace/checkpoints \ + -e PIP_CONSTRAINT= \ + nvcr.io/nvidia/pytorch:26.01-py3 +``` + +```{note} +The NGC PyTorch container constrains the Python environment globally using `PIP_CONSTRAINT`. The `-e PIP_CONSTRAINT=` flag above unsets this so that Megatron Core and its dependencies install correctly. +``` + +Then install Megatron Core inside the container (torch is already available in the NGC image): + +```bash +pip install uv +uv pip install --no-build-isolation "megatron-core[training,dev]" +``` + + +You are now ready to run training. Refer to [Your First Training Run](quickstart.md) for next steps. diff --git a/docs/get-started/overview.md b/docs/get-started/overview.md new file mode 100644 index 00000000000..5ceddcb1f41 --- /dev/null +++ b/docs/get-started/overview.md @@ -0,0 +1,88 @@ + + +# Overview + +Megatron-Core and Megatron-LM are open-source tools that are typically used together to train LLMs at scale across GPUs. Megatron-Core expands the capability of Megatron-LM. Megatron Bridge connects Megatron-Core and Megatron-LM to other popular model ecosystems, such as Hugging Face. + +## Megatron Core + +NVIDIA Megatron Core is a library of essential building blocks for highly efficient large-scale generative AI training. It can be used to train models with high throughput at scale across thousands of GPUs, and it provides an extensive set of tools for multimodal and speech AI. + +Megatron-Core contains GPU-optimized techniques featuring advanced parallelism strategies, optimizations like FP8 training, and support for the latest LLM, MoE, and multimodal architectures. It abstracts these techniques into composable and modular APIs. + +Megatron-Core is compatible with all NVIDIA Tensor Core GPUs and popular LLM architectures such as GPT, BERT, T5, and RETRO. + + +**Composable library** with GPU-optimized building blocks for custom training frameworks. + +**Best for:** + +- **Framework developers** building on top of modular and optimized components +- **Research teams** needing custom training loops, optimizers, or data pipelines +- **ML engineers** requiring fault-tolerant training pipelines + +**What you get:** + +- Composable transformer building blocks (attention, MLP) +- Advanced parallelism strategies (TP, PP, DP, EP, CP) +- Pipeline schedules and distributed optimizers +- Mixed precision support (FP16, BF16, FP8) +- GPU-optimized kernels and memory management +- High-performance dataloaders and dataset utilities +- Model architectures (LLaMA, Qwen, GPT, Mixtral, Mamba) + +## Megatron-LM + +Megatron-LM is a reference implementation: a lightweight framework for large-scale LLM training. It offers a customizable native PyTorch training loop with fewer abstraction layers. It was designed for scaling transformer models to the multi-billion and trillion-parameter regimes under realistic memory and compute constraints. **It serves as a direct entry point for exploring Megatron-Core.** + +It uses advanced parallelization techniques, including tensor and pipeline model parallelism, to allow models with billions of parameters to fit and train across large GPU clusters, enabling breakthroughs in large-scale NLP tasks. It splits model computations across many GPUs, overcoming single-GPU memory limits for training huge models like GPT-style transformers.
+ +**Reference implementation** that includes Megatron Core plus everything needed to train models. + +**Best for:** + +- **Training large foundation models** at scale with strong performance on the latest NVIDIA hardware +- **Research teams** exploring new architectures and training techniques +- **Learning distributed training** concepts and best practices +- **Quick experimentation** with proven model configurations + +**What you get:** + +- Pre-configured training scripts for GPT, LLaMA, DeepSeek, Qwen, and more. +- End-to-end examples from data prep to evaluation +- Research-focused tools and utilities + + + +## Megatron Bridge + +Megatron Bridge provides out-of-the-box bridges and training recipes for models built on top of base model architectures from Megatron Core. + +Megatron Bridge provides a parallelism-aware pathway to convert models and checkpoints. This bidirectional converter performs on-the-fly, model-parallel-aware, per-parameter conversion, and full in-memory loading. + +After training or modifying a Megatron model, you can convert it again for deployment or sharing. Refer to the [Megatron Bridge repository](https://github.com/NVIDIA-NeMo/Megatron-Bridge) for the code and training recipes. + +## Ecosystem Libraries + +**Libraries used by Megatron Core:** + +- **[Megatron Energon](https://github.com/NVIDIA/Megatron-Energon)** - Multi-modal data loader (text, images, video, audio) with distributed loading and dataset blending +- **[Transformer Engine](https://github.com/NVIDIA/TransformerEngine)** - Optimized kernels and FP8 mixed precision support +- **[Resiliency Extension (NVRx)](https://github.com/NVIDIA/nvidia-resiliency-ext)** - Fault tolerant training with failure detection and recovery + +**Libraries using Megatron Core:** + +- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional checkpoint conversion between Hugging Face and Megatron, customizable training loops, and production-ready recipes +- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods +- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples +- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Check out end-to-end examples in [examples/post_training/modelopt](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt). + +**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed) + diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md new file mode 100644 index 00000000000..9d68016ece7 --- /dev/null +++ b/docs/get-started/quickstart.md @@ -0,0 +1,68 @@ + + +# Your First Training Run + +This guide walks you through running your first training jobs with Megatron Core. Make sure you have completed [installation](install.md) before proceeding. 
+ +## Minimal Training Example + +Run a minimal distributed training loop with mock data on 2 GPUs: + +```bash +torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py +``` + +## LLaMA-3 Training Example + +Train an LLaMA-3 8B model with FP8 precision on 8 GPUs using mock data: + +```bash +./examples/llama/train_llama3_8b_h100_fp8.sh +``` + +## Data Preparation + +To train on your own data, Megatron expects preprocessed binary files (`.bin` and `.idx`). + +### 1. Prepare a JSONL File + +Each line should contain a `text` field: + +```json +{"text": "Your training text here..."} +{"text": "Another training sample..."} +``` + +### 2. Preprocess the Data + +```bash +python tools/preprocess_data.py \ + --input data.jsonl \ + --output-prefix processed_data \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model /path/to/tokenizer.model \ + --workers 8 \ + --append-eod +``` + +### Key Arguments + +- `--input`: Path to input JSON/JSONL file +- `--output-prefix`: Prefix for output binary files (.bin and .idx) +- `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, and so on) +- `--tokenizer-model`: Path to tokenizer model file +- `--workers`: Number of parallel workers for processing +- `--append-eod`: Add end-of-document token + +## Next Steps + +- Explore [Parallelism Strategies](../user-guide/parallelism-guide.md) to scale your training +- Learn about [Data Preparation](../user-guide/data-preparation.md) best practices +- Check out [Advanced Features](../user-guide/features/index.md) diff --git a/docs/get-started/releasenotes.md b/docs/get-started/releasenotes.md new file mode 100644 index 00000000000..e624de19f15 --- /dev/null +++ b/docs/get-started/releasenotes.md @@ -0,0 +1,19 @@ + + +# Release Notes + + +## Roadmaps + +Stay up-to-date with our development roadmaps and planned features: + +- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements +- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions + diff --git a/docs/images/context_parallel/CP_overview.png b/docs/images/context_parallel/CP_overview.png new file mode 100644 index 00000000000..38c55b371aa Binary files /dev/null and b/docs/images/context_parallel/CP_overview.png differ diff --git a/docs/images/context_parallel/CP_results.png b/docs/images/context_parallel/CP_results.png new file mode 100644 index 00000000000..e0415ce86eb Binary files /dev/null and b/docs/images/context_parallel/CP_results.png differ diff --git a/docs/images/distrib_optimizer/data_flow.png b/docs/images/distrib_optimizer/data_flow.png index d48fc134c40..01f5cfb2e7e 100644 Binary files a/docs/images/distrib_optimizer/data_flow.png and b/docs/images/distrib_optimizer/data_flow.png differ diff --git a/docs/images/distrib_optimizer/sharding_scheme.png b/docs/images/distrib_optimizer/sharding_scheme.png index b07c25b05f9..e48dd95024a 100644 Binary files a/docs/images/distrib_optimizer/sharding_scheme.png and b/docs/images/distrib_optimizer/sharding_scheme.png differ diff --git a/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png b/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png new file mode 100644 index 00000000000..6c8afa78bb1 Binary files /dev/null and 
b/docs/images/fine_grained_activation_offloading/offloading_and_recomputing.png differ diff --git a/docs/images/megatron_fsdp/DDP_vs_FSDP.png b/docs/images/megatron_fsdp/DDP_vs_FSDP.png new file mode 100644 index 00000000000..627821439e2 Binary files /dev/null and b/docs/images/megatron_fsdp/DDP_vs_FSDP.png differ diff --git a/docs/images/megatron_fsdp/FSDP_Allreduce.png b/docs/images/megatron_fsdp/FSDP_Allreduce.png new file mode 100644 index 00000000000..66e2391ed04 Binary files /dev/null and b/docs/images/megatron_fsdp/FSDP_Allreduce.png differ diff --git a/docs/images/megatron_fsdp/fsdp_double_buffer.png b/docs/images/megatron_fsdp/fsdp_double_buffer.png new file mode 100644 index 00000000000..fbfbcef9b28 Binary files /dev/null and b/docs/images/megatron_fsdp/fsdp_double_buffer.png differ diff --git a/docs/images/megatron_fsdp/fsdp_streams.png b/docs/images/megatron_fsdp/fsdp_streams.png new file mode 100644 index 00000000000..6b8840783c8 Binary files /dev/null and b/docs/images/megatron_fsdp/fsdp_streams.png differ diff --git a/docs/images/megatron_fsdp/fsdp_v_hfsdp_streams.png b/docs/images/megatron_fsdp/fsdp_v_hfsdp_streams.png new file mode 100644 index 00000000000..6f6e61dfb21 Binary files /dev/null and b/docs/images/megatron_fsdp/fsdp_v_hfsdp_streams.png differ diff --git a/docs/images/megatron_fsdp/hfsdp.png b/docs/images/megatron_fsdp/hfsdp.png new file mode 100644 index 00000000000..3c056d20689 Binary files /dev/null and b/docs/images/megatron_fsdp/hfsdp.png differ diff --git a/docs/images/megatron_fsdp/lcm_dim0_shard.png b/docs/images/megatron_fsdp/lcm_dim0_shard.png new file mode 100644 index 00000000000..910add676f1 Binary files /dev/null and b/docs/images/megatron_fsdp/lcm_dim0_shard.png differ diff --git a/docs/images/megatron_fsdp/mixed_sharding.png b/docs/images/megatron_fsdp/mixed_sharding.png new file mode 100644 index 00000000000..81cbc153f8a Binary files /dev/null and b/docs/images/megatron_fsdp/mixed_sharding.png differ diff --git a/docs/images/megatron_fsdp/quantized_param_gather.png b/docs/images/megatron_fsdp/quantized_param_gather.png new file mode 100644 index 00000000000..e1908e66ad7 Binary files /dev/null and b/docs/images/megatron_fsdp/quantized_param_gather.png differ diff --git a/docs/images/megatron_fsdp/sharded_quantization.png b/docs/images/megatron_fsdp/sharded_quantization.png new file mode 100644 index 00000000000..c65bab5305a Binary files /dev/null and b/docs/images/megatron_fsdp/sharded_quantization.png differ diff --git a/docs/images/megatron_fsdp/uneven_sharding.png b/docs/images/megatron_fsdp/uneven_sharding.png new file mode 100644 index 00000000000..0c34a51b026 Binary files /dev/null and b/docs/images/megatron_fsdp/uneven_sharding.png differ diff --git a/docs/images/megatron_fsdp/zero3_model_state.png b/docs/images/megatron_fsdp/zero3_model_state.png new file mode 100644 index 00000000000..84ad33ff779 Binary files /dev/null and b/docs/images/megatron_fsdp/zero3_model_state.png differ diff --git a/docs/images/moe/token_drop.png b/docs/images/moe/token_drop.png new file mode 100644 index 00000000000..1c335ee7aaf Binary files /dev/null and b/docs/images/moe/token_drop.png differ diff --git a/docs/images/multi_token_prediction/MTP_implementation.png b/docs/images/multi_token_prediction/MTP_implementation.png new file mode 100644 index 00000000000..1f246c3e394 Binary files /dev/null and b/docs/images/multi_token_prediction/MTP_implementation.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000000..11337315588 
--- /dev/null +++ b/docs/index.md @@ -0,0 +1,107 @@ + + +# Megatron Core User Guide + +**Megatron Core** is a GPU-optimized library for training large language models at scale. It provides modular, composable building blocks for creating custom training frameworks with state-of-the-art parallelism strategies and performance optimizations. + +Megatron Core offers a flexible, reusable foundation for building large-scale transformer training systems. **Megatron-LM** serves as a reference implementation demonstrating how to use Megatron Core components to train models with billions to trillions of parameters across distributed GPU clusters. + +## Key Features + +* Composable transformer building blocks (attention, MLP) +* Advanced parallelism strategies (TP, PP, DP, EP, CP) +* Pipeline schedules and distributed optimizers +* Mixed precision support (FP16, BF16, FP8) +* GPU-optimized kernels and memory management +* High-performance dataloaders and dataset utilities +* Model architectures (LLaMA, Qwen, DeepSeek, GPT, Mamba) + + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: About Megatron Core + +get-started/overview +get-started/releasenotes +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Get Started + +get-started/install +get-started/quickstart +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Basic Usage + +user-guide/data-preparation +user-guide/training-examples +user-guide/parallelism-guide +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Supported Models + +models/index +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Advanced Features + +user-guide/features/moe +user-guide/features/context_parallel +user-guide/features/megatron_fsdp +user-guide/features/dist_optimizer +user-guide/features/optimizer_cpu_offload +user-guide/features/pipeline_parallel_layout +user-guide/features/fine_grained_activation_offloading +user-guide/data-loading +user-guide/features/megatron_energon +user-guide/features/megatron_rl +user-guide/features/tokenizers +``` + +```{toctree} +:maxdepth: 1 +:hidden: +:caption: Developer Guide + +developer/contribute +developer/submit +developer/oncall +developer/generate_docs +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: API Reference + +api-guide/index +apidocs/index.rst +``` + +```{toctree} +:maxdepth: 2 +:hidden: +:caption: Resources + +advanced/index +``` diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md new file mode 100644 index 00000000000..2754405610c --- /dev/null +++ b/docs/llama_mistral.md @@ -0,0 +1,368 @@ + + +# Llama, Mistral and other Llama-like model support in Megatron-LM + +NOTE: To simplify the code, we now only support converting Llama-3.x and Mistral checkpoints downloaded from Hugging Face. For converting other models, see [Megatron Bridge](models/index.md). + +The Llama-2 and Llama-3.x family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models and were competitive with leading closed-source models. + +Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results. + +Architecturally, Llama-2, Llama-3, and Mistral-7b are very similar. As such, Megatron can support loading checkpoints from all three for inference and finetuning.
Converting the checkpoints and loading them is slightly different for each model and is detailed for each below. + +# Contents + +- [Llama, Mistral and other Llama-like model support in Megatron-LM](#llama-mistral-and-other-llama-like-model-support-in-megatron-lm) +- [Contents](#contents) +- [Llama-2](#llama-2) + - [Download Huggingface checkpoints](#download-huggingface-checkpoints) + - [Convert checkpoint format](#convert-checkpoint-format) + - [Huggingface format](#huggingface-format) + - [Launch model](#launch-model) + - [Launch Megatron](#launch-megatron) + - [Launch Huggingface](#launch-huggingface) + - [Benchmark results](#benchmark-results) + - [Big Bench](#big-bench) + - [Multilingual](#multilingual) + - [LM Evaluation Harness](#lm-evaluation-harness) + - [MMLU](#mmlu) +- [Llama-3.x](#llama-3x) + - [Download Huggingface checkpoints](#download-huggingface-checkpoints) + - [Convert checkpoint format](#convert-checkpoint-format) + - [Huggingface format](#huggingface-format) + - [(Optional) Validate checkpoints](#optional-validate-checkpoints) + - [Launch model](#launch-model) +- [Mistral-7b](#mistral-7b) + - [Download Huggingface checkpoints](#download-huggingface-checkpoints) + - [Convert checkpoint format](#convert-checkpoint-format) + - [(Optional) Validate checkpoints](#optional-validate-checkpoints) + - [Launch model](#launch-model) +- [Other Llama-like model support](#other-llama-like-model-support) +- [Known numerical differences](#known-numerical-differences) + +# Llama-2 + +Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps: + +1. Get access to download the checkpoints. +2. Convert the checkpoints from Huggingface format to Megatron format. +3. Set up arguments for launching the model. + +The following sections detail these steps. The final section lists benchmark result comparisons between: 1) Llama-2 inference code running the Meta-format checkpoints, and 2) Megatron inference code running the converted checkpoints. + +## Download Huggingface checkpoints + +Users must first apply for access to download the Llama-2 checkpoints directly from [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available only in HF format, which can be converted to Megatron format as detailed next. + +## Convert checkpoint format + +We recommend passing `--torch-dtype bfloat16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron-Bridge's checkpoint converter for HF format ([see script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)). + +``` +python Megatron-Bridge/examples/conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-2-7B \ + --megatron-path ./checkpoints/llama2_7b \ + --torch-dtype bfloat16 \ + --device-map auto +``` + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model.
+ +## Launch model + +### Launch Megatron + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 4096 \ +--max-position-embeddings 4096 \ +--tokenizer-type Llama2Tokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--use-rotary-position-embeddings \ +--normalization RMSNorm \ +--no-position-embedding \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 +``` + +### Launch Huggingface + +Huggingface checkpoints can be launched with: `python examples/inference/llama_mistral/huggingface_reference.py --model_path <path_to_model> --prompt <prompt>`. + +## Benchmark results + +The tables below list the benchmark comparisons between native Llama-2 (using Meta's checkpoint and Meta's inference code) and Megatron (using a converted HF checkpoint and Megatron's inference code). + +The values are the percent error between Megatron and Llama-2, calculated using the formula: `|<llama_score> - <megatron_score>| / <llama_score>`, where the type of score is detailed before each table. Across all tests (80 total per model size), the mean error is 0.15%. The small difference in benchmark scores between the two models is due to minor arithmetic differences in implementation that alter the numerics slightly. Some of the factors that influence this difference include: + +- Megatron performs batch matrix multiplications in a couple of places, such as within self attention and in SwiGLU, that Llama performs separately. +- Megatron uses `torch.baddbmm` within self attention, versus Llama using `torch.matmul`. +- Megatron uses a `sin`/`cos` implementation for rotary position embeddings, versus Llama using a `polar`/`complex` implementation. +- Llama calls `torch.set_default_dtype(torch.float16)` during initialization, which Megatron does not. + +### Big Bench + +Score type: multiple choice grade. + +| bigbench / standard | 7b | 13b | 70b | +| -- | -- | -- | -- | +| date_understanding | 0.29% | 0.13% | 0.12% | +| general_knowledge | 0.00% | 0.00% | 0.00% | +| human_organs_senses | 0.00% | 0.00% | 0.00% | +| intent_recognition | 0.00% | 0.11% | 0.00% | +| riddle_sense | 0.00% | 0.00% | 0.00% | +| similarities_abstraction | 0.00% | 0.58% | 0.00% | +| simple_arithmetic_json_multiple_choice | 0.00% | 0.00% | 0.00% | +| undo_permutation | 0.19% | 0.19% | 0.18% | + +### Multilingual + +Score type: multiple choice grade. + +| multilingual / xcopa | 7b | 13b | 70b | +| -- | -- | -- | -- | +| en-template-mGPT-remove-punctuation | 0.08% | 0.00% | 0.00% | +| et-template-mGPT-remove-punctuation | 0.00% | 0.13% | 0.25% | +| ht-template-mGPT-remove-punctuation | 0.26% | 0.13% | 0.26% | +| id-template-mGPT-remove-punctuation | 0.11% | 0.00% | 0.19% | +| it-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | +| qu-template-mGPT-remove-punctuation | 0.00% | 0.00% | 0.27% | +| sw-template-mGPT-remove-punctuation | 0.14% | 0.13% | 0.13% | +| th-template-mGPT-remove-punctuation | 0.25% | 0.13% | 0.13% | +| tr-template-mGPT-remove-punctuation | 0.26% | 0.00% | 0.34% | +| vi-template-mGPT-remove-punctuation | 0.00% | 0.11% | 0.00% | +| zh-template-mGPT-remove-punctuation | 0.00% | 0.10% | 0.09% | + +### LM Evaluation Harness + +Score type: multiple choice grade.
+ +| lm-eval | 7b | 13b | 70b | +| -- | -- | -- | -- | +| boolq | 0.04% | 0.04% | 0.07% | +| hellaswag | 0.02% | 0.03% | 0.03% | +| piqa | 0.00% | 0.00% | 0.07% | +| winogrande | 0.00% | 0.11% | 0.20% | + +### MMLU + +Score type: multiple choice grade. + +Note: the number in brackets is the number of sub-tasks for each supercategory. + +| mmlu | 7b | 13b | 70b | +| -- | -- | -- | -- | +| stem [18] | 0.79% | 0.05% | 0.01% | +| humanities [13] | 0.19% | 0.01% | 0.02% | +| other (business, health, misc.) [14] | 0.08% | 0.06% | 0.12% | +| social sciences [12] | 0.37% | 0.21% | 0.01% | + +# Llama-3.x + +Llama-3.x checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Convert the checkpoints from Huggingface format to Megatron format. +3. (Optional) Validate converted checkpoints. +4. Set up arguments for launching the model. + +The following sections detail these steps. + +## Download Huggingface checkpoints + +Users must first apply for access to download the Llama-3.x checkpoints from [Huggingface](https://huggingface.co/meta-llama). + +## Convert checkpoint format + +We recommend passing `--torch-dtype bfloat16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron-Bridge's checkpoint converter for HF format ([see script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)). + +``` +python Megatron-Bridge/examples/conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama3_2_1b \ + --torch-dtype bfloat16 \ + --device-map auto +``` + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## (Optional) Validate checkpoints + +A Megatron-LM text generation server for Llama3 can be launched using the script `examples/inference/llama_mistral/run_text_generation_llama3.sh`. For Llama3.1, please use `examples/inference/llama_mistral/run_text_generation_llama3.1.sh`. + +Once running, query the server with `curl 'http://<host>:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":["<prompt>"], "tokens_to_generate":100, "top_k":1}'`. + +A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/inference/llama_mistral/huggingface_reference.py --model_path <path_to_model> --prompt <prompt>`.
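If you prefer Python over `curl`, the same query can be issued with the third-party `requests` package. This is a minimal sketch of the `PUT /api` call shown above; the host and prompt are placeholders to adapt to your server:

```python
import requests  # third-party HTTP client: pip install requests

# Query a running Megatron-LM text generation server.
# "localhost" and the prompt are placeholders; the payload matches the curl example above.
response = requests.put(
    "http://localhost:5000/api",
    headers={"Content-Type": "application/json; charset=UTF-8"},
    json={"prompts": ["<prompt>"], "tokens_to_generate": 100, "top_k": 1},
)
print(response.json())
```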
+ +## Launch model + +If loading for either inference or finetuning, use the following arguments for Llama 3.0: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 8192 \ +--max-position-embeddings 8192 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--disable-bias-linear \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--rotary-base 500000 \ +--rotary-percent 1.0 \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 \ +--swiglu \ +--bf16 \ +``` + +For Llama3.1, please use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 8192 \ +--max-position-embeddings 131072 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--disable-bias-linear \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--rotary-base 500000 \ +--rotary-percent 1.0 \ +--use-rope-scaling \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 \ +--swiglu \ +--bf16 \ +``` + +# Mistral-7b + +Megatron currently supports loading the v0.3 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768-token vocabulary) for inference and finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Convert the checkpoints from HuggingFace format to Megatron format. +3. (Optional) Validate converted checkpoints. +4. Set up arguments for launching the model. + +The following sections detail these steps. + +## Download Huggingface checkpoints + +Users must first apply for access to download the Mistral-7b checkpoints through Huggingface. Two variants are available: the base model ([Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3)) and the instruct model ([Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)). + +## Convert checkpoint format + +The HF checkpoints can be converted to Megatron format by using Megatron-Bridge's checkpoint converter for HF format ([see script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)). + +``` +python Megatron-Bridge/examples/conversion/convert_checkpoints.py import \ + --hf-model mistralai/Mistral-7B-Instruct-v0.3 \ + --megatron-path ./checkpoints/mistral_7b \ + --torch-dtype bfloat16 \ + --device-map auto +``` + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## (Optional) Validate checkpoints + +A Megatron-LM text generation server for Mistral-7B can be launched using the script `examples/inference/llama_mistral/run_text_generation_mistral.sh`.
+ +Once running, query the server with `curl 'http://<host>:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":["<prompt>"], "tokens_to_generate":100, "top_k":1}'`. + +A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/inference/llama_mistral/huggingface_reference.py --model_path <path_to_model> --prompt <prompt>`. + +## Launch model + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 4096 \ +--max-position-embeddings 4096 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--apply-layernorm-1p \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--disable-bias-linear \ +--rotary-base 1000000 \ +--rotary-percent 1.0 \ +--swiglu \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 +``` + +# Other Llama-like model support + +*Note: Experimental* + +Many models such as Yi-34B and Qwen2.x use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama-3.x](#llama-3x). + +# Known numerical differences + +It is not expected that the Megatron and Huggingface implementations of Llama-3.x and Mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list: + +1. TransformerEngine (TE) uses the model `params_dtype` inside RMSNorm, whereas the Huggingface implementation uses fp32. +2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs, whereas Megatron core combines them into a single GEMM for efficiency. This leads to small numerical differences. diff --git a/docs/models/index.md b/docs/models/index.md new file mode 100644 index 00000000000..0ee379b01bd --- /dev/null +++ b/docs/models/index.md @@ -0,0 +1,26 @@ + + +# Supported Models + +Megatron Core supports a wide range of language and multimodal models with optimized implementations for large-scale training. + +## Model Conversion + +For converting HuggingFace models to Megatron format, use [Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge), the official standalone converter. Megatron Bridge supports an extensive list of models including LLaMA, Mistral, Mixtral, Qwen, DeepSeek, Gemma, Phi, Nemotron, and many more. + +See the [Megatron Bridge supported models list](https://github.com/NVIDIA-NeMo/Megatron-Bridge?tab=readme-ov-file#supported-models) for the complete and up-to-date list of supported models. + +```{toctree} +:maxdepth: 1 + +llms +multimodal +../llama_mistral +``` diff --git a/docs/models/llms.md b/docs/models/llms.md new file mode 100644 index 00000000000..f649673a2cc --- /dev/null +++ b/docs/models/llms.md @@ -0,0 +1,59 @@ + + +# Language Models + +Megatron Core supports the following language model architectures for large-scale training. + +## Converting HuggingFace Models + +Use [**Megatron Bridge**](https://github.com/NVIDIA-NeMo/Megatron-Bridge) to convert HuggingFace models to Megatron format.
Megatron Bridge is the official standalone converter with support for an extensive list of models including LLaMA, Mistral, Mixtral, Qwen, DeepSeek, Gemma, Phi, Nemotron, and many more. + +See the [Megatron Bridge supported models list](https://github.com/NVIDIA-NeMo/Megatron-Bridge?tab=readme-ov-file#supported-models) for the complete and up-to-date list. + +## Decoder-Only Models + +| Model | Description | Key Features | +|-------|-------------|--------------| +| **GPT** | Generative Pre-trained Transformer | Standard autoregressive LM, foundational architecture | +| **LLaMA** | Meta's LLaMA family | Efficient architecture with RoPE, SwiGLU, RMSNorm | +| **Mistral** | Mistral AI models | Sliding window attention, efficient inference | +| **Mixtral** | Sparse Mixture-of-Experts | 8x7B MoE architecture for efficient scaling | +| **Qwen** | Alibaba's Qwen series | HuggingFace integration, multilingual support | +| **Mamba** | State Space Model | Subquadratic sequence length scaling, efficient long context | + +## Encoder-Only Models + +| Model | Description | Key Features | +|-------|-------------|--------------| +| **BERT** | Bidirectional Encoder Representations | Masked language modeling, classification tasks | + +## Encoder-Decoder Models + +| Model | Description | Key Features | +|-------|-------------|--------------| +| **T5** | Text-to-Text Transfer Transformer | Unified text-to-text framework, sequence-to-sequence | + +## Example Scripts + +Training examples for these models can be found in the `examples/` directory: +- `examples/gpt3/` - GPT-3 training scripts +- `examples/llama/` - LLaMA training scripts +- `examples/mixtral/` - Mixtral MoE training +- `examples/mamba/` - Mamba training scripts +- `examples/bert/` - BERT training scripts +- `examples/t5/` - T5 training scripts + +## Model Implementation + +All language models are built using Megatron Core's composable transformer blocks, enabling: +- Flexible parallelism strategies (TP, PP, DP, EP, CP) +- Mixed precision training (FP16, BF16, FP8) +- Distributed checkpointing +- Efficient memory management diff --git a/docs/models/multimodal.md b/docs/models/multimodal.md new file mode 100644 index 00000000000..07ff76d8d9a --- /dev/null +++ b/docs/models/multimodal.md @@ -0,0 +1,71 @@ + + +# Multimodal Models + +Megatron Core supports multimodal models that combine language with vision, audio, and other modalities for comprehensive multimodal understanding. + +## MIMO: Multimodal In/Out Framework + +**MIMO (Multimodal In/Out Model)** is an experimental framework in Megatron Core that supports arbitrary combinations of modalities including vision, audio, and text. MIMO provides a flexible architecture for building custom multimodal models. + +> **Note**: MIMO is experimental and under active development. The API may change in future releases. + +**Key Features:** +- Arbitrary modality combinations (vision, audio, text) +- Flexible encoder architecture for different input modalities +- Unified embedding space across modalities +- Support for both vision-language and audio-vision-language models + +See [examples/mimo](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mimo) for training scripts and examples. 
+ +## Vision-Language Models + +| Model | Description | Vision Encoder | Language Model | +|-------|-------------|----------------|----------------| +| **LLaVA** | Visual instruction tuning | CLIP ViT-L/14 | Mistral-7B / LLaMA | +| **NVLM** | NVIDIA Vision-Language Model | CLIP / Custom ViT | LLaMA-based | +| **LLaMA 3.1 Nemotron Nano VL** | Efficient multimodal model | Vision Transformer | LLaMA 3.1 8B | + +## Vision Encoders + +| Model | Description | Key Features | +|-------|-------------|--------------| +| **CLIP ViT** | OpenAI's CLIP Vision Transformer | Image-text alignment, multiple scales (L/14@336px) | +| **RADIO** | Resolution-Agnostic Dynamic Image Optimization | Flexible resolution handling, efficient vision encoding | + +## Diffusion Models + +For multimodal diffusion models (image generation, text-to-image), refer to [NVIDIA Diffusion Models](https://github.com/NVIDIA-NeMo/DFM/). The Developer Program, NIM, and NeMo offer production-ready implementations of: + +- Stable Diffusion variants +- Text-to-image generation +- Image-to-image translation +- ControlNet and other conditioning mechanisms + +## Multimodal Features + +- **Image-Text Alignment**: Pre-training on image-caption pairs +- **Visual Instruction Tuning**: Fine-tuning on instruction-following datasets +- **Flexible Vision Encoders**: Support for different ViT architectures and resolutions +- **Combined Checkpointing**: Unified checkpoints combining vision and language models +- **Efficient Training**: Full parallelism support (TP, PP, DP) for both vision and language components + +## Example Scripts + +Multimodal training examples can be found in the following directories: + +**MIMO Framework:** +- `examples/mimo/` - Multimodal In/Out training with support for vision-language and audio-vision-language models + +**Specific Multimodal Models:** +- `examples/multimodal/` - LLaVA-style training with Mistral + CLIP +- `examples/multimodal/nvlm/` - NVLM training scripts +- `examples/multimodal/llama_3p1_nemotron_nano_vl_8b_v1/` - Nemotron VL training +- `examples/multimodal/radio/` - RADIO vision encoder integration diff --git a/docs/project.json b/docs/project.json new file mode 100644 index 00000000000..d5b9535338b --- /dev/null +++ b/docs/project.json @@ -0,0 +1,2 @@ +{"name": "megatron-lm", "version": "nightly"} + diff --git a/docs/user-guide/data-loading.md b/docs/user-guide/data-loading.md new file mode 100644 index 00000000000..b60cd685cf2 --- /dev/null +++ b/docs/user-guide/data-loading.md @@ -0,0 +1,152 @@ + + +# Data Loading at Scale + +This guide covers how Megatron's data pipeline works and how to configure it for efficient training at 256 nodes and beyond. At this scale, the primary bottlenecks are **index building** and **barrier synchronization** -- not raw data bandwidth. + +## How Data Loading Works + +Understanding the architecture helps explain why specific flags matter. + +Megatron builds three index arrays for each dataset: a **document index** (shuffled document order), a **sample index** (mapping samples to document offsets), and a **shuffle index** (final sample permutation). This happens once during initialization: + +1. **Rank 0** builds all three indices and writes them to a cache directory as `.npy` files. +2. All ranks synchronize at a `torch.distributed.barrier()`. +3. **All other ranks** load the cached indices via memory-mapped reads (`numpy.load(mmap_mode='r')`). + +After initialization, data access is **read-only and lock-free**.
Each data-parallel rank consumes a disjoint subset of samples, and no cross-rank coordination is needed during training because all ranks derive the same deterministic permutation from a shared random seed. + +## The Problem at 256+ Nodes + +Two things break down at large node counts: + +1. **Barrier synchronization**: All ranks block while rank 0 builds indices. On a 512-node job, this means 4,095 GPUs sit idle. +2. **Simultaneous memory-mapping**: All ranks `mmap` three large `.npy` files at once after the barrier, causing a burst of page faults and I/O. + +## Baseline: Establish Maximum Achievable Performance + +Before tuning data loading, establish a performance ceiling by running with `--mock-data`. This bypasses the data pipeline entirely and shows the maximum throughput your configuration can achieve without any dataloader overhead. The gap between `--mock-data` performance and real-data performance tells you exactly how much time the dataloader is costing you. + +## Recommended Configuration + +### Step 1: Consolidate dataset files + +A common issue at scale is having datasets split across many small file prefixes. Thousands of 100 MB files perform significantly worse than tens of 10 GB+ files, both for building dataset caches and for runtime file access. + +Use the merge tool to consolidate datasets stored as many small prefixes in one directory: + +```bash +python tools/merge_datasets.py \ + --input /path/to/input-directory \ + --output-prefix /path/to/output/merged +``` + +**Target at least 10 GB per file.** This reduces the number of file descriptors, metadata lookups, and index-building work at initialization. + +### Step 2: Pre-build the dataset cache + +Build the GPT dataset cache as a separate step before training. This avoids the usual "rank 0 builds, everyone else waits" startup path and is the recommended workflow for large jobs: + +```bash +python tools/prepare_cache.py \ + --data-path <data_path> \ + --split 99,1,0 \ + --data-cache-path /path/to/cache \ + --global-batch-size <global_batch_size> \ + --seq-length <seq_length> \ + ... +``` + +If your later training job does not set `--global-batch-size`, or you are preparing the cache on a machine that does not match the future training topology, also pass: + +```bash +--prepare-cache-world-size <world_size> +``` + +This keeps the prepared cache aligned with the sample counts expected by training. + +> **Unsupported configurations:** `tools/prepare_cache.py` does not support `--mock-data`, `--sft`, `--fim-data`, or `--step-batch-size-schedule`. Using any of these will cause the script to exit with an error. + +### Step 3: Optionally pre-build per-dataset metadata + +When blending many datasets, generate the `--per-dataset-sequences-path` JSON ahead of time to avoid one metadata read per file prefix at startup: + +```bash +python tools/build_sequences_per_dataset.py \ + --data-path <data_path> \ + --per-dataset-sequences-path sequences.json +``` + +### Step 4: Launch training with optimized data loading + +Once the cache is ready, enable the fast-path flags: + +```bash +torchrun --nproc_per_node=8 --nnodes=512 ... pretrain_gpt.py \ + --dataloader-fast-cache-load \ + --dataloader-defer-npy-index-mmap \ + --per-dataset-sequences-path sequences.json \ + --data-cache-path /path/to/cache \ + --num-workers 2 \ + ... +``` + +### Flag reference + +| Flag | Default | Recommendation | What it does | +|------|---------|----------------|-------------| +| `--dataloader-fast-cache-load` | off | **On** | Skips the rank-0 barrier by assuming the cache already exists.
All ranks build their dataset views in parallel. This is the single biggest win at scale. | +| `--dataloader-defer-npy-index-mmap` | off | **On** | Defers memory-mapping of `.npy` index files until first access. When combined with `--num-workers > 0`, index loading is overlapped with the training iteration rather than blocking startup. | +| `--per-dataset-sequences-path` | None | **Set when blending many datasets** | Points to a JSON file mapping each dataset path to its `(sequence_count, document_count)`. Replaces per-file metadata reads with a single JSON lookup. Generate with `tools/build_sequences_per_dataset.py`. | +| `--data-cache-path` | None | **Set** | Directory where index `.npy` files are cached. Must be on shared storage for multi-node jobs so all ranks can read it. | +| `--num-workers` | 2 | **Keep as small as necessary** | Number of DataLoader worker processes. The goal is to satisfy: *time to process a batch > time to prepare a batch*. This hides dataloader work behind the training step. Increasing beyond what's needed wastes CPU and memory. | +| `--no-mmap-bin-files` | mmap on | **Test both** | Memory-mapping `.bin` files leverages the OS page cache, but the optimal setting is filesystem-dependent. Some large-scale production configurations disable mmap. Test with and without to determine what works best for your storage. | + +### Object storage (S3 / Multi-Storage Client) + +When data lives on S3 or MSC rather than a POSIX filesystem: + +- **Index files** (`.idx`) are cached locally under `object_storage_cache_path`. +- **Binary data files** (`.bin`) are streamed on-demand in 256 MB chunks, avoiding the need to download entire files. +- Set `--no-mmap-bin-files` since memory-mapping doesn't apply to object storage. +- Ensure the index-cache path is visible wherever the later dataset construction will run. + +## Scaling Characteristics + +| Aspect | Behavior | Why it works | +|--------|----------|-------------| +| **Cross-rank contention** | None after init | All index files are read-only; `numpy.memmap` uses OS page cache with no locking | +| **Sampling determinism** | All ranks produce the same permutation | Shared `numpy.random.RandomState(seed)` with epoch-based seed variation | +| **Data-parallel sharding** | Each DP rank gets a disjoint subset of samples | No overlap during training; assignment happens in the sampler rather than via extra dataset coordination | +| **Index broadcast** | Via shared filesystem, not collectives | Rank 0 writes `.npy` files; other ranks read them. No explicit `torch.distributed.broadcast` | + +## Troubleshooting + +**Symptom: Training hangs at startup for minutes** +- Likely cause: Rank 0 is building indices while all other ranks wait at the barrier. +- Fix: Pre-build the cache with `tools/prepare_cache.py` and enable `--dataloader-fast-cache-load`. + +**Symptom: Spike in I/O at training start, then normal** +- Likely cause: All ranks simultaneously memory-mapping index files after the barrier. +- Fix: Enable `--dataloader-defer-npy-index-mmap` to overlap index loading with training. + +**Symptom: Slow data loading during training (not just startup)** +- Run with `--mock-data` to confirm the dataloader is the bottleneck. +- If startup, not steady-state throughput, is the main issue, try `--dataloader-defer-npy-index-mmap`. +- If you are blending many dataset prefixes, try `--per-dataset-sequences-path`. +- Test with `--no-mmap-bin-files` -- the optimal setting depends on your filesystem. 
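To make the determinism and sharding rows in the Scaling Characteristics table above concrete, the toy sketch below mimics the mechanism in plain NumPy. It is an illustration only, not the actual `GPTDataset` or sampler code; the function and variable names are invented for the example:

```python
import numpy as np

def sample_order(seed: int, epoch: int, num_samples: int) -> np.ndarray:
    # Every rank seeds the same RandomState (varied per epoch), so the
    # permutation is identical everywhere -- no broadcast is needed.
    rng = np.random.RandomState(seed + epoch)
    return rng.permutation(num_samples)

seed, epoch, num_samples, dp_world_size = 1234, 0, 16, 4

# Simulate each data-parallel rank computing the sample order independently.
orders = [sample_order(seed, epoch, num_samples) for _ in range(dp_world_size)]
assert all(np.array_equal(orders[0], o) for o in orders)

# The sampler then hands each rank a disjoint, strided shard of that order.
shards = [orders[rank][rank::dp_world_size] for rank in range(dp_world_size)]
assert sorted(int(i) for s in shards for i in s) == list(range(num_samples))
```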
+ +## Related Resources + +- [PR #2445](https://github.com/NVIDIA/Megatron-LM/pull/2445): Original implementation of fast cache load, deferred mmap, and per-dataset sequences optimizations. +- [PR #4080](https://github.com/NVIDIA/Megatron-LM/pull/4080): Adds `tools/prepare_cache.py` for offline GPT dataset cache preparation. +- [`tools/prepare_cache.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/prepare_cache.py): Pre-build GPT dataset caches ahead of training. +- [`tools/merge_datasets.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/merge_datasets.py): Merge multiple small dataset files into larger ones. +- [`tools/build_sequences_per_dataset.py`](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/build_sequences_per_dataset.py): Generate the `--per-dataset-sequences-path` JSON file. diff --git a/docs/user-guide/data-preparation.md b/docs/user-guide/data-preparation.md new file mode 100644 index 00000000000..813f81501e7 --- /dev/null +++ b/docs/user-guide/data-preparation.md @@ -0,0 +1,126 @@ + + +# Data Preparation + +Preparing your data correctly is essential for successful training with Megatron Core. + +## Data Format + +Megatron Core expects training data in JSONL (JSON Lines) format, where each line is a JSON object: + +```json +{"text": "Your training text here..."} +{"text": "Another training sample..."} +{"text": "More training data..."} +``` + +## Preprocessing Data + +Use the `preprocess_data.py` tool to convert your JSONL data into Megatron's binary format: + +```bash +python tools/preprocess_data.py \ + --input data.jsonl \ + --output-prefix processed_data \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model /path/to/tokenizer.model \ + --workers 8 \ + --append-eod +``` + +### Key Arguments + +The following table summarizes the main preprocessor arguments: + +| Argument | Description | +|----------|-------------| +| `--input` | Path to input JSON/JSONL file | +| `--output-prefix` | Prefix for output binary files (.bin and .idx) | +| `--tokenizer-type` | Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, and so on) | +| `--tokenizer-model` | Path to tokenizer model file | +| `--workers` | Number of parallel workers for processing | +| `--append-eod` | Add end-of-document token | +
+## Finding Optimal Number of Workers + +Use the `--find-optimal-num-workers` flag to find the number of workers that gives the best performance in terms of preprocessed documents per second. +The script launches a few short data preprocessing runs with different worker counts and identifies the fastest run using the collected performance data. + +```bash +python tools/preprocess_data.py \ + --input data.jsonl \ + --output-prefix processed_data \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model /path/to/tokenizer.model \ + --workers 8 \ + --find-optimal-num-workers \ + --workers-to-check 4 8 16 32 \ + --max-documents 50000 +``` + +**Required arguments** + +The following table lists the arguments required for worker optimization: + +| Argument | Description | +|----------|-------------| +| `--find-optimal-num-workers` | Activates the search for the optimal number of workers | +| `--workers-to-check` | List of worker counts to test | +| `--max-documents` | Number of documents to be preprocessed during each run | + +**Output example** + +The command prints performance results similar to the following: + +```bash +----------------------------------- +Performance results (fastest → slowest): +1. 16 workers → avg. docs/s: 9606.6476 +2.
32 workers → avg. docs/s: 9275.3284 +3. 8 workers → avg. docs/s: 9151.9280 +4. 4 workers → avg. docs/s: 6391.3819 + +----------------------------------- +The most optimal num of workers is 16 with avg. preprocessed docs/s: 9606.6476. +----------------------------------- +``` + +## Output Files + +The preprocessing tool generates two files: + +- `processed_data.bin` - Binary file containing tokenized sequences +- `processed_data.idx` - Index file for fast random access + +## Using Preprocessed Data + +Reference your preprocessed data in training scripts: + +```bash +--data-path processed_data \ +--split 949,50,1 # Train/validation/test split +``` + +## Common Tokenizers + +### HuggingFace Tokenizers + +```bash +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model /path/to/tokenizer.model +``` + +### GPT-2 BPE Tokenizer + +```bash +--tokenizer-type GPT2BPETokenizer \ +--vocab-file gpt2-vocab.json \ +--merge-file gpt2-merges.txt +``` diff --git a/docs/user-guide/features/context_parallel.md b/docs/user-guide/features/context_parallel.md new file mode 100644 index 00000000000..890609ac7de --- /dev/null +++ b/docs/user-guide/features/context_parallel.md @@ -0,0 +1,43 @@ + + +# Context Parallel Package + +## Context Parallelism Overview + +```{figure} ../../images/context_parallel/CP_overview.png +:alt: Diagram of a transformer layer with tensor parallelism 2 and context parallelism 2, showing CP and TP communication patterns around attention and other blocks. +:align: center + +Figure 1: A transformer layer running with TP2CP2. Communications next to Attention are for CP, others are for TP. (AG/RS: all-gather in forward and reduce-scatter in backward, RS/AG: reduce-scatter in forward and all-gather in backward, /AG: no-op in forward and all-gather in backward). +``` + +Context Parallelism (CP) is a parallelization scheme on the sequence-length dimension. Unlike prior SP (sequence parallelism), which only splits the sequence of Dropout and LayerNorm activations, CP partitions the network inputs and all activations along the sequence dimension. With CP, all modules except attention (for example, Linear and LayerNorm) can work as usual without any changes, because they do not have inter-token operations. For attention, the Q (query) of each token must combine with the KV (key and value) of all tokens in the same sequence. CP therefore requires an additional all-gather across GPUs to collect the full sequence of KV. Correspondingly, reduce-scatter is applied to the activation gradients of KV in backward propagation. To reduce activation memory footprint, each GPU stores only the KV of a sequence chunk in forward and gathers KV again in backward. KV communication happens between a GPU and its counterparts in other TP groups. The all-gather and reduce-scatter are implemented as point-to-point communications in a ring topology. Exchanging KV can also leverage MQA or GQA to reduce communication volume, because those variants use one or a few attention heads for KV. + +For example, in Figure 1, if the sequence length is 8K, each GPU processes 4K tokens. GPU0 and GPU2 form a CP group and exchange KV with each other; the same pattern applies between GPU1 and GPU3. CP is similar to [Ring Attention](https://arxiv.org/abs/2310.01889) but targets higher performance by (1) using current open-source and cuDNN flash attention kernels, and (2) avoiding extra work from lower-triangle causal masking while keeping load balanced across GPUs. 
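+
+To make the load-balancing idea concrete, the sketch below shows one common way to split a causal sequence so that every CP rank gets roughly equal work: cut the sequence into 2 × CP chunks and pair an early (cheap) chunk with a late (expensive) one. This is an illustration of the scheme described above, not the actual Megatron Core implementation; the function name and shapes are made up for the example.
+
+```python
+import torch
+
+def cp_load_balanced_split(seq: torch.Tensor, cp_size: int, cp_rank: int) -> torch.Tensor:
+    """Pair chunk r with chunk (2*cp_size - 1 - r) along the sequence dim.
+
+    Under a causal mask, later tokens attend to more context and cost more
+    compute, so pairing an early chunk with a late chunk balances work
+    across CP ranks.
+    """
+    chunks = seq.chunk(2 * cp_size, dim=0)
+    return torch.cat([chunks[cp_rank], chunks[2 * cp_size - 1 - cp_rank]], dim=0)
+
+# Example: an 8K-token sequence with CP=2 leaves 4K tokens per GPU, as in
+# Figure 1. Rank 0 holds chunks 0 and 3; rank 1 holds chunks 1 and 2.
+seq = torch.randn(8192, 1024)
+local = cp_load_balanced_split(seq, cp_size=2, cp_rank=0)
+assert local.shape[0] == 4096
+```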
+
+## Context Parallelism Benefits
+
+```{figure} ../../images/context_parallel/CP_results.png
+:alt: Chart of speedup for 175B GPT with different tensor parallelism and context parallelism combinations compared with full activation recomputation.
+:align: center
+
+Figure 2: Speedup of 175B GPT with various TP+CP combinations compared to full recomputation (that is, TP8CP1).
+```
+
+An LLM can hit an out-of-memory (OOM) error on long contexts (long sequence lengths) because activation memory grows about linearly with sequence length. Recomputing activations in backward can avoid OOM but adds significant overhead (about 30 percent with full recomputation). Increasing TP (tensor model parallelism) can also fix OOM, but it can make compute in layers such as Linear too short to hide communication latency. Scaling to more GPUs with larger TP can hit that overlap limit even when OOM is not the driver.
+
+CP addresses these tradeoffs. With CP, each GPU computes on part of the sequence, which scales down both compute and communication by the CP degree. Overlap between them is less of a concern. The activation memory footprint per GPU is also smaller by the CP degree, which reduces OOM risk. As Figure 2 shows, TP and CP together can outperform full recomputation by removing most recompute overhead and balancing compute against communication.
+
+## Enabling Context Parallelism
+
+CP support is included on the GPT code path. Other models that share that path, such as LLaMA, can use CP as well. CP works with TP (tensor model parallelism), PP (pipeline model parallelism), and DP (data parallelism). The total GPU count is TP × CP × PP × DP. CP also works with different attention variants, including MHA, MQA, and GQA, with unidirectional or bidirectional masking.
+
+Enable CP by setting `context_parallel_size=` on the command line. The default `context_parallel_size` is 1, which disables CP. Running with CP requires Megatron Core (>=0.5.0) and Transformer Engine (>=1.1).
+
diff --git a/docs/user-guide/features/dist_optimizer.md b/docs/user-guide/features/dist_optimizer.md
new file mode 100644
index 00000000000..bfea3b63a66
--- /dev/null
+++ b/docs/user-guide/features/dist_optimizer.md
@@ -0,0 +1,49 @@
+
+
+# Distributed Optimizer
+
+The distributed optimizer saves memory by sharding optimizer state across data parallel ranks instead of replicating it on every rank, as described in the [ZeRO paper](https://arxiv.org/abs/1910.02054).
+
+Theoretical memory savings depend on the data types of the model parameters (`param_dtype`) and of the main gradients accumulated across data-parallel replicas (`grad_dtype`). Optimizer steps always use `fp32` main parameters. In the current implementation, the theoretical number of bytes per parameter is as follows (where *d* is the data parallel size):
+
+| | Non-distributed optim | Distributed optim |
+| ------ | ------ | ------ |
+| `fp16` parameters, `fp16` gradients | 20 | 4 + 16/d |
+| `bf16` parameters, `fp32` gradients | 18 | 6 + 12/d |
+| `fp32` parameters, `fp32` gradients | 16 | 8 + 8/d |
+
+This distributed optimizer uses contiguous buffers for parameters and main gradients. Model gradients are copied into the main gradients as soon as they are computed.
+
+The following figures show the sharding scheme and the main steps of the parameter update. 
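+
+For a concrete sense of the table above: with `bf16` parameters, `fp32` main gradients, and data-parallel size d = 8, the distributed optimizer needs 6 + 12/8 = 7.5 bytes per parameter instead of 18, a roughly 2.4× reduction that approaches 3× as d grows.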
+ +## Data Flow + +![Diagram of gradient and parameter data flow through reduce-scatter, optimizer step, and all-gather across data parallel ranks](../../images/distrib_optimizer/data_flow.png) + +## Sharding Scheme + +![Diagram of how optimizer state shards across data parallel ranks](../../images/distrib_optimizer/sharding_scheme.png) + +## Key Steps + +**Note:** The following steps match the illustrations above. They assume `bf16` model weights, `bf16` model gradients from the backward pass, and `fp32` main gradients for optimizer steps. Optimizer steps use `fp32` main weights. + +- Backward pass finishes (gradient buffer holds 16 `fp32` gradient elements). +- Call reduce-scatter on each DP rank. +- Each DP rank now has four elements within the gradient buffer that are fully reduced (remaining 12 elements are garbage). + - DP rank 0 has gradient values for elements [0:4]. + - DP rank 1 has gradient values for elements [4:8]. + - DP rank 2 has gradient values for elements [8:12]. + - DP rank 3 has gradient values for elements [12:16]. +- Optimizer.step(). +- Each DP rank copies its four `fp32` main parameter elements into the corresponding `bf16` parameter buffer (each element is cast from `fp32` to `bf16`). +- Call all-gather on each DP rank. +- The parameter buffer now contains all 16 updated `bf16` model parameter elements. Parameters in PyTorch modules already point to the correct views in this buffer, so forward passes can start after the all-gather completes. +- At this point, you can zero the gradient buffer for the next iteration. diff --git a/docs/user-guide/features/fine_grained_activation_offloading.md b/docs/user-guide/features/fine_grained_activation_offloading.md new file mode 100644 index 00000000000..7b97c17cef9 --- /dev/null +++ b/docs/user-guide/features/fine_grained_activation_offloading.md @@ -0,0 +1,46 @@ + + +# Fine-Grained Activation Offloading + +Contributed in collaboration with RedNote. + +Memory is often the limiting factor for very large sparse MoE models such as DeepSeek-V3 and Qwen3-235B. Fine-grained recomputation lowers activation memory at the cost of extra compute. Offloading can use host-device bandwidth so that reload overlaps compute and keeps overhead small in many setups. Fine-grained activation offloading moves activations at module granularity so you can tune how much activation memory leaves the device and adjust training throughput. + +Supported offloading modules are `"attn_norm"`, `"core_attn"`, `"attn_proj"`, `"mlp_norm"`, `"expert_fc1"`, and `"moe_act"`. They can be combined with fine-grained recomputation to free almost all activations for a transformer layer on the device. + +## Features + +- Pipeline parallelism: PP=1, PP, and interleaved PP +- Compatible with fine-grained recomputation +- FP8 training +- MTP +- Mixed dense and MoE layers +- A2A overlap +- CUDA graphs + - **Note:** A CUDA graph capture cannot include the offloading modules (temporary limitation). + +## Usage + +```bash +# Enable fine-grained activation offloading +--fine-grained-activation-offloading + +# Modules whose inputs are offloaded (refer to your training script for list or delimiter syntax). +# Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act". +--offload-modules expert_fc1 +``` + +## Compatible With Fine-Grained Recomputation + +- For low-overhead modules such as LayerNorm or `moe_act`, use recomputation to save activation memory. +- For other modules, use offloading to save activation memory. 
+- Overlap offload and reload with compute when possible. + +![Diagram comparing fine-grained activation offloading and fine-grained recomputation across a transformer layer](../../images/fine_grained_activation_offloading/offloading_and_recomputing.png) diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md new file mode 100644 index 00000000000..9dea6bd34a4 --- /dev/null +++ b/docs/user-guide/features/index.md @@ -0,0 +1,27 @@ + + +# Advanced Features + +Guides for Megatron Core training features. + +```{toctree} +:maxdepth: 2 + +fine_grained_activation_offloading +moe +context_parallel +megatron_fsdp +dist_optimizer +optimizer_cpu_offload +pipeline_parallel_layout +tokenizers +megatron_energon +megatron_rl +``` diff --git a/docs/user-guide/features/megatron_energon.md b/docs/user-guide/features/megatron_energon.md new file mode 100644 index 00000000000..c32b2d8facd --- /dev/null +++ b/docs/user-guide/features/megatron_energon.md @@ -0,0 +1,143 @@ + + +# Megatron Energon + +Multimodal dataloader for text, images, video, and audio at scale. + +## Overview + +[**Megatron Energon**](https://github.com/NVIDIA/Megatron-Energon) supports large-scale multimodal training with: + +- **Multimodal support** - Text, images, video, and audio +- **Distributed loading** - Suited to multi-node training +- **Data blending** - Mix datasets with configurable weights +- **WebDataset format** - Streaming from cloud storage +- **State management** - Save and restore training position + +## Installation + +```bash +pip install megatron-energon +``` + +## Key Features + +### Data Processing + +- **Packing** - Packs samples to use sequence length capacity +- **Grouping** - Batching of similar-length sequences +- **Joining** - Combine multiple dataset sources +- **Object storage** - Stream from S3, GCS, and Azure Blob Storage + +### Production Use + +- Distributed loading across workers and nodes +- Checkpoint data loading state +- Memory-efficient streaming +- Parallel data loading with prefetching + +## Basic Usage + +```python +from megatron.energon import get_train_dataset, get_loader, WorkerConfig + +# Create dataset +ds = get_train_dataset( + '/path/to/dataset', + batch_size=32, + shuffle_buffer_size=1000, + worker_config=WorkerConfig.default_worker_config(), +) + +# Create loader and iterate +for batch in get_loader(ds): + # Training step + pass +``` + +## Multimodal Example + +```python +# Load image-text dataset +ds = get_train_dataset( + '/path/to/multimodal/dataset', + batch_size=32, + worker_config=WorkerConfig(num_workers=8, prefetch_factor=2), +) + +for batch in get_loader(ds): + images = batch['image'] # Image tensors + texts = batch['text'] # Text captions + # Process batch +``` + +## Dataset Blending + +Mix multiple datasets with custom weights: + +```python +from megatron.energon import Blender + +blended_ds = Blender([ + ('/path/to/dataset1', 0.6), # 60% + ('/path/to/dataset2', 0.3), # 30% + ('/path/to/dataset3', 0.1), # 10% +]) +``` + +## Configuration + +### Worker Configuration + +```python +WorkerConfig( + num_workers=8, # Parallel workers + prefetch_factor=2, # Batches to prefetch per worker + persistent_workers=True, # Keep workers alive between epochs +) +``` + +### Common Parameters + +The following table summarizes frequently used dataset and loader parameters: + +| Parameter | Description | +|-----------|-------------| +| `batch_size` | Samples per batch | +| `shuffle_buffer_size` | Buffer size for randomization | +| `max_samples_per_sequence` | Max samples 
to pack into one sequence | +| `worker_config` | Worker configuration for parallel loading | + +## Integration with Megatron-LM + +```python +from megatron.energon import get_train_dataset, get_loader +from megatron.training import get_args + +args = get_args() + +train_ds = get_train_dataset( + args.data_path, + batch_size=args.micro_batch_size, +) + +for iteration, batch in enumerate(get_loader(train_ds)): + loss = train_step(batch) +``` + +## Resources + +- **[Megatron Energon GitHub](https://github.com/NVIDIA/Megatron-Energon)**: Documentation and examples +- **[Multimodal Examples](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal)**: Megatron-LM multimodal training + +## Next Steps + +- Refer to [Multimodal Models](../../models/multimodal.md) for supported architectures +- Refer to [Training Examples](../training-examples.md) for integration examples diff --git a/docs/user-guide/features/megatron_fsdp.md b/docs/user-guide/features/megatron_fsdp.md new file mode 100644 index 00000000000..36fcc68893c --- /dev/null +++ b/docs/user-guide/features/megatron_fsdp.md @@ -0,0 +1,608 @@ + + +# Megatron-FSDP + +## ✨ Overview + +**Megatron-FSDP** is an NVIDIA-developed distributed parallelism library written in native PyTorch that provides a high-performance implementation of **Fully Sharded Data Parallelism (FSDP)**. It offers seamless cross-compatibility with various deep learning frameworks and parallelism libraries such as Megatron-Core, and is performance-optimized to support training and inference of extremely large PyTorch models at data-center scale on NVIDIA GPUs. + +- PyPI: https://pypi.org/project/megatron-fsdp/ +- Source Code: https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/distributed/fsdp/src + +### 🧩 Compatibility + +- PyTorch **[DeviceMesh](https://docs.pytorch.org/docs/2.11/distributed.html#torch.distributed.device_mesh.DeviceMesh)**, **[DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html)**, and **[Distributed Checkpoint (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html)** +- **[Megatron Core](https://github.com/NVIDIA/Megatron-LM)** +- **[TransformerEngine](https://github.com/NVIDIA/TransformerEngine)** +- **[NVIDIA NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)** + +### 💡 Features + +- **Performant & Scalable**: Optimized for NVIDIA CUDA with efficient memory management and performance. Sports near-linear scaling up from single compute nodes to entire data-centers. +- **Multiple Algorithms in One**: Supports sharding your choice of optimizer states, gradients, and model parameters (FSDP), including hierarchical data parallelism strategies such as **Hybrid-Sharded Data Parallelism (HSDP)** and **Hybrid-FSDP (HFSDP / Fully-Sharded Optimizer State)** for optimizing intra-node and inter-node memory, communication, and performance. +- **"Bring Your Own Parallelism"**: Works seamlessly with PyTorch, Megatron-LM, Megatron-Bridge, and TransformerEngine, and can be plugged into other frameworks such as HuggingFace Transformers and TorchTitan. +- **Simple & Powerful**: Similar to PyTorch FSDP, the `fully_shard` API doesn't depend on any complex training framework or distributed environment. 
+ +### ⏱️ Optimizations + +- **[TransformerEngine](https://github.com/NVIDIA/TransformerEngine) Mixed-Precision & Fused Kernels**: Native performance- and memory-optimal _compatibility with MXFP8, NVFP4, and various other quantization recipes and fused kernels provided by TransformerEngine_. +- **Advanced Bucketing**: `dtype`-customizable and precision-aware bucketing system to _tune the memory overhead, numerical accuracy, and latency of collectives_. Avoids redundant `COPY` operations before and after collectives, while remaining compatible with **[DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html)** features such as **[Torch Distributed Checkpoint (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html)**. +- **Buffer Management**: Efficient use of storage and [NCCL User Buffer Registration](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#user-buffer-registration) enable _direct communication into NCCL-managed memory_, achieving true zero-`COPY` data movement. Introduced in NCCL `v2.27`, **NCCL Symmetric Memory** communications employ _symmetric kernels_ that drastically reduce SM utilization and include networking optimizations such as high-precision (`FP32`) reduction over-the-wire. +- **Optimized Communication & SM Utilization via SHARP**: Leverages [**SHARP** (Scalable Hierarchical Aggregation and Reduction Protocol)](https://docs.nvidia.com/networking/display/sharpv3130) to _offload FSDP collectives to network switches (InfiniBand or NVLink-Switch)_ and significantly reduce utilization of GPU streaming multi-processors (SM) from 16-32 to 1-6 for **Multi-Node NVLink (MNNVL)** systems (Grace-Blackwell, Vera-Rubin, etc.), which lowers communication latency in large scaled-out workloads and frees up GPU-hosted processors for overlapped compute (GEMM) kernels. When FSDP sharding domains span both NVLink and InfiniBand, **hierarchical SHARP collectives** (NVL-SHARP and IB-SHARP) _optimize communication paths across the entire system topology_. +- [**Hybrid-FSDP (HFSDP)**](#understanding-hybrid-fsdp-hfsdp), a variation of _Hybrid-Sharded Data Parallelism (HSDP)_ that further shards the optimizer state across intra- and inter-node data-parallel ranks, _bridges the memory-communication trade-off between HSDP and FSDP_, unlocking memory efficiency at minimal cost to performance. + +## 🚀 Quick Start + +### 📦 Installation + +#### NeMo Framework Container + +Megatron-FSDP is pre-installed with Megatron-Core in the [NVIDIA NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags). + +#### Megatron-Core + +Megatron-FSDP is bundled with Megatron-Core, which can be installed via `pip`: + +``` +# Install via PyPI +pip install --no-build-isolation megatron-core[mlm,dev] + +# Install from Source +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +pip install --no-build-isolation .[mlm,dev] +``` + +To import Megatron-FSDP in Python: +```python +import megatron.core.distributed.fsdp.src.megatron_fsdp +``` + +#### PyPI + +To install Megatron-FSDP as a standalone package to use the `fully_shard` API: + +``` +pip install megatron-fsdp +``` + +To import Megatron-FSDP in Python: + +```python +import megatron_fsdp +``` + +### 🎛️ Megatron-FSDP `fully_shard` + +Megatron-FSDP supports a simple `fully_shard` API that seamlessly enables FSDP with very few lines of code. 
+
+```python
+import torch
+import torch.distributed.checkpoint  # needed for torch.distributed.checkpoint.save/load below
+from megatron_fsdp import (
+    fully_shard_model,
+    fully_shard_optimizer,
+)
+
+# Initialize Torch Distributed.
+torch.distributed.init_process_group()
+torch.cuda.set_device(torch.distributed.get_rank())
+
+# Fully-shard the model.
+model = torch.nn.Transformer()
+fsdp_model = fully_shard_model(
+    module=model,
+    fsdp_unit_modules=[
+        torch.nn.TransformerEncoder,
+        torch.nn.TransformerDecoder
+    ]
+)
+
+# Fully-shard the optimizer.
+toy_adam = torch.optim.AdamW(params=fsdp_model.parameters(), lr=0.01)
+optimizer = fully_shard_optimizer(optimizer=toy_adam)
+
+# Forward pass.
+inp = torch.randn(1, 512, 512).to("cuda")
+tgt = torch.randn(1, 512, 512).to("cuda")
+output = fsdp_model(inp, inp)
+
+# Backward pass.
+torch.nn.functional.mse_loss(output, tgt).backward()
+
+# Optimizer step.
+optimizer.step()
+optimizer.zero_grad()
+
+# Checkpoint the model and optimizer.
+torch.distributed.checkpoint.save({
+    "model": fsdp_model.state_dict(),
+    "optimizer": optimizer.state_dict(),
+}, checkpoint_id="ckpt/")
+
+# Load the saved checkpoint.
+ckpt = {
+    "model": fsdp_model.state_dict(),
+    "optimizer": optimizer.state_dict(),
+}
+torch.distributed.checkpoint.load(state_dict=ckpt, checkpoint_id="ckpt/")
+fsdp_model.load_state_dict(ckpt["model"], strict=False)
+optimizer.load_state_dict(ckpt["optimizer"])
+```
+
+> ℹ️ `fully_shard` is an _**experimental**_ API. Please check back for updates as we fine-tune our user experience! For more examples using `fully_shard` for Megatron-FSDP, refer to our suite of unit tests: [`tests/unit_tests/distributed/megatron_fsdp/test_mfsdp_fully_shard.py`](../../../tests/unit_tests/distributed/megatron_fsdp/test_mfsdp_fully_shard.py)
+
+### 🤖 Megatron-LM
+
+Megatron-FSDP is deeply integrated into Megatron-Core. To enable FSDP (where optimizer states, gradients, and compute parameters are sharded) in Megatron, use the following arguments:
+
+```
+# Train models in Megatron-LM using Megatron-FSDP.
+--use-megatron-fsdp
+--data-parallel-sharding-strategy {no_shard, optim, optim_grads, optim_grads_params}
+--ckpt-format fsdp_dtensor
+```
+
+Complete Llama-8B and DeepSeek-V3 training scripts using Megatron-FSDP with recommended settings can be found in [Megatron-LM/examples/megatron_fsdp](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/megatron_fsdp).
+
+#### Recommended Configuration for Megatron-LM
+
+Frequently used options for Megatron-FSDP include:
+
+```bash
+# Un-set CUDA_DEVICE_MAX_CONNECTIONS to ensure stream independence / full parallelization of FSDP computation and communication. May slightly affect TP and CP performance.
+unset CUDA_DEVICE_MAX_CONNECTIONS
+
+# Meta-Device Initialization - Load large models onto CUDA devices in shards to avoid OOM.
+--init-model-with-meta-device
+
+# Per-Token Loss / No Gradient Scaling - Deactivate DP scaling during gradient reduction, which can be a drain on SM resources.
+--calculate-per-token-loss
+
+# Decrease gradient reduction and accumulation precision to recommended data-types based on the precision of the model parameters, usually BF16. Reduces communication volume during the backward pass. Can be further customized with `--megatron-fsdp-main-grads-dtype` and `--megatron-fsdp-grad-comm-dtype`, which are enabled by this argument.
+--grad-reduce-in-bf16
+
+# Register NCCL user buffers and Megatron-FSDP double buffers to enable zero-copy symmetric kernels and low-SM utilization via SHARP. 
Improves overall performance but increases memory overhead due to double-buffering and is NOT compatible with `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. +--use-nccl-ub +--fsdp-double-buffer +--fsdp-manual-registration +``` + +### 🤖 Megatron-Core + +Megatron-FSDP has a lower-level `FullyShardedDataParallel` class API that can be used with a simplified version of Megatron-LM's training loop. + +```python +# Initialize model and optimizer. +ddp_config.use_megatron_fsdp = True +# Megatron-FSDP Base Sharding Strategies: +# no_shard, optim, optim_grads, optim_grads_params +ddp_config.data_parallel_sharding_strategy = "optim_grads_params" +model = GPTModel(transformer_config) +model = FullyShardedDataParallel( + transformer_config, + model, + ddp_config, + fsdp_unit_modules = [TransformerLayer, LanguageModelEmbedding], +) +optimizer = torch.optim.AdamW(model.parameters(), lr=lr) +optimizer = DistributedOptimizer(optimizer, [model], [model.param_and_grad_buffer]) + +# Training loop +def train_step(inputs, labels): + optimizer.zero_grad() + for mbs_input, mbs_label in zip(inputs, labels): + outputs = model(mbs_input) + loss = loss_fn(outputs, mbs_label) + loss.backward() + optimizer.step() + +# Save and load model and optimizer state dict +def model_and_optimizer_state_dict(): + state_dict = { + "model": model.sharded_state_dict(), + "optimizer": optimizer.sharded_state_dict(), + } + return state_dict + +def load_model_and_optimizer_state_dict(state_dict): + model.load_state_dict(state_dict["model"]) + optimizer.load_state_dict(state_dict["optimizer"]) +``` + +### 🔁 Checkpoint Conversion + +Megatron-FSDP checkpointing supports [PyTorch Distributed Checkpoint (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html). In Megatron-LM, this is the `--ckpt-format fsdp_dtensor` checkpointing format. + +#### Converting Torch DCP to Torch Save (Non-Distributed) Checkpoints + +PyTorch has utilities to convert Torch DCP checkpoints to and from regular Torch checkpoints: +```shell +python -m torch.distributed.checkpoint.format_utils --help +usage: format_utils.py [-h] {torch_to_dcp,dcp_to_torch} src dst + +positional arguments: + {torch_to_dcp,dcp_to_torch} + Conversion mode + src Path to the source model + dst Path to the destination model + +options: + -h, --help show this help message and exit +``` +For example: +```shell +python -m torch.distributed.checkpoint.format_utils dcp_to_torch dcp_ckpt/ torch_ckpt.pt +``` +or: +```python +from torch.distributed.checkpoint.format_utils import ( + dcp_to_torch_save, + torch_save_to_dcp, +) + +# Convert DCP model checkpoint to torch.save format. +dcp_to_torch_save(CHECKPOINT_DIR, TORCH_SAVE_CHECKPOINT_PATH) + +# Convert torch.save model checkpoint back to DCP format. +torch_save_to_dcp(TORCH_SAVE_CHECKPOINT_PATH, f"{CHECKPOINT_DIR}_new") +``` +Torch Save checkpoints can then be converted into HuggingFace SafeTensors or other checkpoint formats for distribution. + +> ℹ️ Megatron-FSDP checkpoints have a `module.` prefix pre-pended to all model parameter names in the state dictionary, and converting a Torch Save checkpoint to a Megatron-FSDP Torch DCP checkpoint requires testing. Work-in-progress! 
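+
+As a rough illustration of the remapping involved, the sketch below strips that `module.` prefix from a converted Torch Save checkpoint. The prefix behavior is documented above; the single-file layout, the `torch_ckpt.pt` path, and the top-level `"model"` key are assumptions carried over from the `dcp_to_torch` example, so adapt them to your checkpoint's actual structure.
+
+```python
+import torch
+
+# Load a torch.save checkpoint produced by the dcp_to_torch conversion above.
+state_dict = torch.load("torch_ckpt.pt", map_location="cpu")
+
+# Megatron-FSDP prepends "module." to every model parameter name; strip it
+# when comparing against (or exporting to) checkpoints with bare names.
+model_state = state_dict.get("model", state_dict)
+stripped = {key.removeprefix("module."): value for key, value in model_state.items()}
+```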
+ +#### Converting N-D Parallel (`torch_dist`) to Megatron-FSDP (`fsdp_dtensor`) Checkpoints + +As a pre-requisite for checkpoint conversion, dump the parameter group mapping when training with 3D-parallel (DDP, TP, PP) and/or EP: + +```bash +--dump-param-to-param-group-map /path/to/param_to_param_group_map +``` + +and convert the map to a `param_to_param_group_map.json` JSON file in the `/path/to/param_to_param_group_map` directory: + +```bash +python tools/checkpoint/checkpoint_inspector.py print-torch-dcp-in-json /path/to/param_to_param_group_map +``` + +> ℹ️ If you already have a `torch_dist` checkpoint, simply specify the `--dump-param-to-param-group-map /path/to/param_to_param_group_map` flag and run a trivial training or checkpointing experiment to create the `param_to_param_group_map` you need without full pretraining. + +Finally, convert your `torch_dist` checkpoint to the `fsdp_dtensor` format using the `param_to_param_group_map.json`: + +```bash +torchrun --nproc_per_node=8 --nnodes=1 \ + tools/checkpoint/checkpoint_inspector.py \ + convert-torch-dist-to-fsdp-dtensor (--swiglu) \ # --swiglu for specific models. + /path/to/input_torch_dist_checkpoint/ \ + /path/to/output_fsdp_dtensor_checkpoint/ \ + --param-to-param-group-map-json /path/to/param_to_param_group_map.json +``` + +> ℹ️ For multi-node conversion tasks, please refer to the DeepSeek-V3 example script (`sbatch_checkpoint_convert.sh`) in [Megatron-LM/examples/megatron_fsdp](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/megatron_fsdp). + +## Megatron-FSDP Feature Guide & API + +| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config | +|--------------|-------------|----------------------|----------------------| +| **Megatron-FSDP** | Use Megatron-FSDP in Megatron-LM. | `--use-megatron-fsdp` | `fully_shard_model(module)` | +| **Megatron-FSDP Checkpointing** | Save and load un-even DTensor checkpoints using [Torch Distributed Checkpoint (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html). | `--ckpt-format fsdp_dtensor` | `preproc_state_dict_for_dcp_ckpt=True` | +| **Meta Device Initialization** | Megatron-FSDP initializes a meta-initialized model to the CUDA device in shards to avoid OOM on large models. Requires implementation of `Module.reset_parameters()` for per-Module sharded initialization. | `--init-model-with-meta-device` | `init_model_with_meta_device=True` | +| **Distributed Optimizer** | Megatron-FSDP uses Megatron-Core's `DistributedOptimizer`. Automatically set when using Megatron-FSDP. | `--use-distributed-optimizer` | `fully_shard_optimizer(optimizer)` | + +### FSDP Fundamentals + +```{figure} ../../images/megatron_fsdp/DDP_vs_FSDP.png +:alt: FSDP Pipeline +:align: center + +Comparison between Distributed Data Parallelism (DDP) and Fully-Sharded Data Parallelism (FSDP). While gradients are all-reduced in DDP, they are sharded and reduce-scattered with FSDP. + +Source: Meta AI, Ott, Myle, et al. “Fully Sharded Data Parallel: Faster AI Training with Fewer GPUs.” _Facebook Engineering_, 15 July 2021, https://engineering.fb.com/2021/07/15/open-source/fsdp/. +``` + +**Fully Sharded Data Parallelism (FSDP)** is a type of distributed data parallelism (DDP) that shards optimizer state, weight gradients (`wgrad`), and model weights across devices that ingest data-parallel samples for data-parallel training or inference. 
Activations (`fprop`) and data gradients (`dgrad`) are not sharded or distributed, and are preserved for the backward pass, but can be recomputed during the backward pass, offloaded to CPU, or sharded / routed using other parallelisms such as tensor parallelism (TP), context parallelism (CP), or expert parallelism (EP). + +```{figure} ../../images/megatron_fsdp/zero3_model_state.png +:alt: ZeRO-3 Model State +:align: center + +Sharded memory profiles for ZeRO-1 (optimizer state), ZeRO-2 (optimizer state and gradients), and ZeRO-3 (optimizer state, gradients, and parameters). + +Source: Zero-Redundancy Optimizer Model State Partition Diagram. From _The Ultra-Scale Playbook: Training LLMs on GPU Clusters_ by Tazi, Nouamane, et al. HuggingFace, 2025, https://huggingface.co/spaces/nanotron/ultrascale-playbook. +``` + +The core principles of FSDP are: + +- Only a small depth-wise fraction of the model state can exist un-sharded at any point in time. +- Communication should overlap computation. + +From these core principles, software requirements can be derived: + +0. Model states sharded by FSDP are directly initialized across devices in shards. +1. Model parameters are all-gathered (AG) in pre-designated groups or modules pre-forward and pre-backward to un-shard a small fraction of the model state at any point in time during training or inference. After `fprop` and `dgrad` computation, the un-sharded weights are immediately de-allocated. +2. `wgrad` are reduce-scattered (RS) and accumulated in pre-designated groups or modules immediately post-backward to limit the amount of un-sharded gradients at any point in time during training or inference. +3. Distributed optimizers, optimizers that are initialized with respect to a sharded model state and support distributed mechanics, update the sharded model state using the reduced gradient shard to implement data parallelism (DP). +4. Computation and communication are overlapped across multiple CUDA streams, expending multiple streaming multi-processors (SM). Weights from subsequent groups or modules are pre-fetched, which ideally hides the communication latency required for FSDP behind model computation kernels (GEMM). + +FSDP can also be visualized as a decomposition of the all-reduce collective used in DDP into a gradient reduce-scatter, distributed optimization step, and parameter all-gather. + +```{figure} ../../images/megatron_fsdp/FSDP_Allreduce.png +:alt: FSDP RS & AG +:align: center + +Source: Feng, Wei, Will Constable, and Yifan Mao. “Getting Started with Fully Sharded Data Parallel (FSDP2).” _PyTorch Tutorials_, 17 Mar. 2022, https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html. +``` + +### FSDP Unit Modules + +| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config | +|--------------|-------------|----------------------|----------------------| +| **FSDP Unit Modules** | A list of `str` or `class` import paths for `torch.nn.Module`(s) that are considered FSDP unit modules and sharded by Megatron-FSDP. Parameters and sub-modules that are not members of an FSDP unit are not sharded. | Defaults to supported Megatron-Core modules (`TransformerLayer`, etc.) in Megatron-LM. | `fsdp_unit_modules=[...]` | +| **FSDP Double Buffer Allocator** | Megatron-FSDP uses the double-buffer allocator, which persistently allocates a buffer pair assigned to alternating FSDP units that temporarily stores parameters and gradients. Automatically used with NCCL user buffer registration. 
| `--fsdp-double-buffer` | `fsdp_double_buffer=True` | +| **Param All-Gather Overlap** | Whether to overlap parameter all-gather with compute. Automatically activated for the ZeRO-3 sharding strategy. | `--overlap-param-gather` | `overlap_param_gather=True` | +| **Gradient Reduce-Scatter Overlap** | Whether to overlap gradient reduce-scatter or all-reduce with compute. Automatically activated for ZeRO-2 and ZeRO-3 sharding strategies. | `--overlap-grad-reduce` | `overlap_grad_reduce=True` | +| **FSDP Communication Size** | Customize the size (in `numel()` elements) of AG and RS communications in Megatron-FSDP, by limiting how many elements are concurrently pre-fetched or reduced for AG and RS. Effectively suggests how many FSDP units are processed concurrently, which may launch collectives earlier and improve performance. Optionally, tune this value depending on system memory and performance requirements. | `--suggested-communication-unit-size ` | N/A (Megatron-Core Only) | + +> Only a small depth-wise fraction of the model state can exist un-sharded at any point in time. + +**FSDP Unit Modules** represent fractions of the model state that are computed and communicated as a (coalesced) group, un-sharded when needed for computation, and re-sharded after computation to release memory for subsequent model states. Implicitly, an FSDP unit module is also a **_modeling contract_**, requiring that FSDP-managed unit module parameters are not accessed or modified beyond the scope of the forward pass, backward pass, or optimization step. + +Megatron-FSDP accepts a list of `str` or `class` paths representing FSDP unit modules via the `fsdp_unit_modules` argument, which is currently hard-coded to supported model classes (like `TransformerLayer`) in Megatron-Core. It performs a depth-first traversal of the model (via `torch.nn.Module.named_modules()`) and groups the parameters of each matching module for sharding and coalesced communication. Nested units are resolved by precedence: if a module matches an FSDP unit class but is already a sub-module of a previously registered FSDP unit, it is skipped, so the outermost (and necessarily largest) FSDP unit class in any module sub-tree becomes the effective FSDP unit module. + +> Communication should overlap computation. + +Once a model is partitioned into unit modules, computation is overlapped with communication based on the granularity of the FSDP unit module. Depending on the size of the compute and communication kernels, fine-tuning the unit module size and grouping configuration can impact performance and elicit trade-offs between overlap and memory when using FSDP. + +```{figure} ../../images/megatron_fsdp/fsdp_streams.png +:alt: FSDP Streams +:align: center + +Each color-coded block in the compute and communication streams, merged and categorized in the simplified (and worst-case) scenario where SM resources are under contention, correspond to a _single_ FSDP unit module. +``` + +Compute-communication overlaps are orchestrated using **CUDA streams** that capture and parallelize serial operations. All collectives associated with all combinations of `{DP-Inner, DP-Outer}` and `{AG, RS}` are scheduled and tracked with separate streams and communicators / `ProcessGroup`(s). + +- Parameters are un-sharded prior to `fprop` and `dgrad` computation. To overlap the pre-fetch all-gather with computation, at least two FSDP units worth of un-sharded weight memory is required at any point in time. +- Gradients are reduced and sharded after `wgrad` computation. 
To overlap gradient reduce-scatter with `wgrad` computation, at least two FSDP units worth of un-sharded gradient memory is required at any point in time. + +#### FSDP Module Hooks + +To implement these "unit-periodic" mechanics, Megatron-FSDP uses `Module` hooks to install a variety of (pre- and post-) forward and backward operations: + +- **Pre-Forward** + - Un-shards the model parameters of the current and (via pre-fetching) forward-subsequent FSDP unit modules. + - When `MegatronFSDP.forward()` is invoked, Megatron-FSDP will swap all parameter references to point to the un-sharded `Tensor` compute weights for the forward and backward pass. +- **Post-Forward** + - Re-shards model weights after the forward pass, if the module is an FSDP unit. Non-unit modules remain persistently un-sharded. + - When using activation recomputation during the backwards pass, computing both `fprop` and `dgrad` requires these parameters, so parameters are resharded during **Post-Backward**. + - Releases the transpose cache of quantized parameters (in FSDP / ZeRO-3) for specific quantization recipes in `TransformerEngine`. +- **Pre-Backward** + - Un-shards the model parameters of the current and (via pre-fetching) backward-subsequent FSDP unit modules. + - Implemented as a `torch.autograd.graph.register_multi_grad_hook` triggered by the output `dgrad`, and installed via a `Module` _post-forward_ hook. +- **Post-Backward** + - Re-shards model weights after the backward pass, if the module is an FSDP unit. Non-unit modules remain persistently un-sharded. + - Implemented by injecting an Autograd function (`RegisterFSDPBackwardFunction`) that is installed during a `Module` _pre-forward_ hook. + - Reduces gradients after the backward pass. + - Implemented using a `Tensor.register_post_accumulate_grad_hook` triggered by `param.grad`, as well as a root-level post-backward hook installed during **Pre-Backward** (`torch.autograd.Variable._execution_engine.queue_callback`). +- **State Dictionary** + - When `module.state_dict()` (for any module managed by Megatron-FSDP) is invoked, Megatron-FSDP will swap all parameter references to point to sharded `DTensor` main weights for distributed optimization and checkpointing. + - When `MegatronFSDP.load_state_dict()` is invoked, both the main and compute weights are updated. When using quantized model compute, the main weights are quantized and sharded. + +#### Double Buffering + +Megatron-FSDP uses a `Tensor._typed_storage()._resize_(bytes)`-based allocator to instantly allocate and de-allocate memory without depending on the `CUDACachingAllocator` for un-sharded parameters and gradients by default. (Cache fragmentation and garbage collection can procrastinate large quantities of `cudaMalloc` and `cudaFree` operations that can block programs and spike memory, particularly when memory utilization is maxed out.) However, modifying the underlying storage of a buffer is not compatible with NCCL symmetric registration or CUDA graphability, which require a persistent state during runtime. + +To support these optimizations, Megatron-FSDP uses **double-buffering**, which assigns 2 persistently-allocated buffers to FSDP units in an alternating pattern, hard-limiting the memory overhead for parameter and gradient buffer allocation and ensuring that no more than 2 FSDP units are computed or communicated concurrently. + +```{figure} ../../images/megatron_fsdp/fsdp_double_buffer.png +:alt: FSDP Double Buffering +:align: center + +Visualization of double buffering in Megatron-FSDP. 
Even- and odd-indexed FSDP units share the same un-sharded parameter and gradient buffers, overwriting incumbent data as needed during runtime. Megatron-FSDP ensures that no more than two FSDP units are un-sharded at any point during runtime.
+```
+
+With double-buffering, Megatron-FSDP does not need to allocate memory after initialization, which can reduce memory fragmentation and improve performance. However, double-buffering requires _depth-wise model symmetry_, where even- and odd-indexed FSDP units have identical size during runtime. If double-buffering is utilized, Megatron-FSDP computes the **_mode_** of FSDP unit sizes as the symmetrical double-buffer size, and any FSDP units not symmetrical to the computed size will default to the `_resize_(bytes)`-based allocator (or are persistently allocated, for extremely large and asymmetrical layers that significantly affect performance, such as `torch.nn.Embedding`, when the low-level argument `fsdp_db_use_persist_buf_on_alloc_fail` is set).
+
+### Data-Parallel Sharding Strategies
+
+| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config |
+|--------------|-------------|----------------------|----------------------|
+| **Data Parallel Sharding Strategy** | Primary data-parallel sharding strategy for FSDP, which supports DDP, ZeRO-1 (optimizer), ZeRO-2 (optimizer and gradients), and ZeRO-3 (optimizer, gradients, and parameters). Typically uses intra-node communications, i.e. "inner" or "intra" DP. | `--data-parallel-sharding-strategy {no_shard, optim, optim_grads, optim_grads_params}` | `zero_dp_strategy={no_shard, optim, optim_grads, optim_grads_params, 0, 1, 2, 3}` |
+| **DP-Outer Sharding Strategy** | Secondary data-parallel sharding strategy for HSDP, which supports Hybrid-Sharded Data Parallel (HSDP / `no_shard`) and Hybrid-FSDP (HFSDP / `optim`). Typically uses inter-node communications, i.e. "outer" or "inter" DP. | `--outer-dp-sharding-strategy {no_shard, optim}` | `outer_dp_sharding_strategy={no_shard, optim, 0, 1}` |
+| **Hybrid Data Parallelism Size** | Specify the DP-Outer / Inter-DP parallel size. DP-Inner / Intra-DP sizes will be deduced from the sizes of other parallelisms and `torch.distributed.get_world_size()`. | `--num-distributed-optimizer-instances ` | `dp_outer_dim=` (Cumulative DP groups `hybrid_fsdp_group` / `hybrid_fsdp_expt_group` are required for HFSDP.) |
+
+Megatron-FSDP supports a range of sharding strategies over a variety of distributed topologies:
+
+- **Distributed Data Parallelism (DDP)**
+  - Model state is replicated across DP ranks.
+  - Gradient all-reduce is overlapped with backward compute and launched during the last backward pass before the optimization step.
+- **ZeRO-1**
+  - Optimizer state is sharded across DP ranks.
+  - Gradient reduce-scatter is overlapped with backward compute and launched during the last backward pass before the optimization step. (Reduce-scatter is used in lieu of all-reduce for performance, because only a shard of the gradient is needed for optimization.)
+- **ZeRO-2**
+  - Optimizer state and gradients are sharded across DP ranks.
+  - Gradient reduce-scatter is overlapped with backward compute and accumulated during every backward pass.
+- **Fully-Sharded Data Parallelism (FSDP / ZeRO-3)**
+  - Optimizer state, gradients, and parameters are sharded across DP ranks.
+  - Gradient reduce-scatter is overlapped with backward compute and accumulated during every backward pass. 
+- **Hybrid-Sharded Data Parallelism (HSDP)**
+  - Optimizer state, gradients, and parameters are sharded across the "inner" or "intra" DP ranks.
+  - Model state is replicated across "outer" / "inter" DP ranks, and outer data-parallel gradients are all-reduced during the last backward pass before the optimization step.
+- **Hybrid-FSDP (HFSDP)**
+  - Optimizer state, gradients, and parameters are sharded across the "inner" or "intra" DP ranks.
+  - Optimizer state is _further_ sharded across "outer" / "inter" DP ranks.
+  - Outer data-parallel gradients are reduce-scattered during the last backward pass before the optimization step.
+  - Outer data-parallel parameters are all-gathered during the first forward pass after the optimization step.
+  - FSDP primary sharding (`optim_grads_params`) is required for HFSDP secondary sharding (`optim`).
+  - Requires passing cumulative data-parallel groups (`hybrid_fsdp_group` / `hybrid_fsdp_expt_group`), which include ALL data-parallel ranks, to Megatron-FSDP.
+  - To create these using `DeviceMesh`, create a data-parallel `DeviceMesh` for the cumulative DP group and use `DeviceMesh._unflatten(dp_dim, mesh_sizes=(dp_outer_size, dp_inner_size), mesh_dim_names=("dp_outer_dim", "dp_shard_dim"))` to construct a `DeviceMesh` with DP-Inner and DP-Outer mesh dimensions for Hybrid-FSDP.
+
+#### Understanding Hybrid-FSDP (HFSDP)
+
+```{figure} ../../images/megatron_fsdp/hfsdp.png
+:alt: Hybrid-FSDP Topology
+:align: center
+
+Hybrid-FSDP (HFSDP) is a variation of HSDP where the optimizer state in particular is sharded across both DP-Inner and DP-Outer, i.e. all data-parallel ranks, which further reduces memory utilization. In other words, intra-node sharding and communication uses ZeRO-3, while inter-node sharding and communication uses ZeRO-1. Parameters and gradients are converted from and to the fully-sharded optimizer state during optimization steps only, reducing the frequency of inter-node communications.
+
+Inspired by the artistry in the DHEN (Zhang, Luo, Liu, Meta, et al., 2022) paper: https://arxiv.org/abs/2203.11014
+```
+
+**Hybrid-Fully Sharded Data Parallelism (HFSDP)** is a slight modification to HSDP that fully-shards the optimizer state across all data-parallel ranks and introduces outer-level all-gather and reduce-scatter collectives to map fully-sharded parameters and gradients into partially-sharded parameters and gradients.
+
+The memory profile of HFSDP is a "hybrid" of FSDP (optimizer state) and HSDP (gradients and model weights). Another elegant way to understand HFSDP functionality is ZeRO-1 composed with ZeRO-3.
+
+$$\text{Hybrid-FSDP Memory Profile} = \frac{\text{Optimizer State}}{\text{DP-Inner} \ \times \ \text{DP-Outer}} + \frac{\text{Gradient} + \text{Weight}}{\text{DP-Inner}}$$
+
+The modified algorithm has the following characteristics:
+
+- Megatron-FSDP maintains a view of the model parameters sharded across all data-parallel ranks.
+  - Distributed checkpoints save and load the fully-sharded model parameters.
+  - Distributed optimizer state is initialized on the fully-sharded model parameters.
+- During the first forward pass after checkpointing or optimization, fully-sharded model weights are all-gathered into partially-sharded model weights.
+- During the last backward pass before optimization, partially-sharded model gradients are reduce-scattered into fully-sharded model gradients.
+- Otherwise, FSDP is performed on the partially-sharded model weights and accumulated gradients. 
Because model weights and gradients are only updated and ingested once per optimization cycle, we can skip or postpone all expensive inter-node / DP-outer collectives until an optimization step.
+
+In addition to improved memory utilization, HFSDP communications are split in communication size (bytes communicated), communication topology (DP-Inner and DP-Outer groups), and communication domain (NVLink and InfiniBand) across two sharding stages.
+
+```{figure} ../../images/megatron_fsdp/fsdp_v_hfsdp_streams.png
+:alt: Hybrid-FSDP Streams
+:align: center
+
+Inter-node communications can also be parallelized with intra-node communications using separate CUDA streams.
+```
+
+#### Mixing FSDP & Model Parallelism
+
+Megatron-FSDP is also compatible with a variety of model parallelisms that shard the model state, such as **Tensor Parallelism (TP)** and **Expert Parallelism (EP)**. When sharding model states across multiple dimensions in the device topology, _**FSDP sharding is always performed last**_, because FSDP collectives un-shard and re-shard parameters and gradients immediately before and after computation. Thus, FSDP sharding mechanics are implemented over tensor and expert parallel (strided) shards.
+
+```{figure} ../../images/megatron_fsdp/mixed_sharding.png
+:alt: Mixed Model Parallelism
+:align: center
+
+Whenever FSDP is composed with other model parallelisms, FSDP sharding is always exercised last to seamlessly integrate with existing model shards.
+```
+
+Megatron-FSDP uses `torch.distributed.DeviceMesh` to describe and configure communications across devices in data-parallel group(s). Because heterogeneous models that have mixed layers, such as [Hybrid Mamba-Transformer](https://arxiv.org/abs/2504.03624) or [Mixture-of-Experts (MoE)](https://arxiv.org/abs/1701.06538) models, require different parallelism configurations, multiple `DeviceMesh`(s) may be needed for specific layers that call for distinct distributed topologies for optimal memory efficiency and performance.
+
+Currently, Megatron-FSDP supports two `DeviceMesh`(s), one for dense / non-expert `Module`(s) and another for Megatron-Core MoE sparse / expert `Module`(s). (Expert modules and parameters in Megatron-Core are automatically detected.)
+
+- Dense modules typically have a `DeviceMesh` with data parallel, tensor parallel, and context parallel dimensions, where the data parallel dimension is used for FSDP. Typically, both data-parallel and context-parallel ranks are used for sharding in FSDP.
+- Mixture-of-experts modules typically have a `DeviceMesh` with data parallel, tensor parallel, and expert parallel dimensions, where the data parallel dimension is used for FSDP.
+
+For more information about Mixture-of-Experts in Megatron-Core, refer to the [Megatron-Core User Guide - MoE](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/moe.html).
+
+#### Non-Uniform / Un-Even Model Sharding
+
+While `torch.distributed.tensor.DTensor` defaults to per-parameter sharding, where Tensors are split evenly on `dim=0` across the data-parallel domain, Megatron-FSDP uses **non-uniform or un-even `DTensor` shards** of a (flattened) group of parameters associated with an FSDP unit.
+
+```{figure} ../../images/megatron_fsdp/uneven_sharding.png
+:alt: Non-Uniform Sharding
+:align: center
+
+Comparison of FSDP2 per-parameter sharding and Megatron-FSDP per-unit or per-module sharding. 
FSDP2 requires `COPY` operations to move parameters and gradients in and out of communication buffers to reduce the frequency of NCCL collective calls, while Megatron-FSDP assigns sliced views of contiguous communication buffers to parameters associated with an FSDP unit.
+```
+
+While complex and less user-intuitive, an un-evenly sharded data structure enables a few performance benefits without introducing expensive `COPY` operations to set up communication and computation buffers:
+
+- **Fewer NCCL calls**, reducing kernel launch and synchronization overhead. Only parameters in FSDP units that have different communication-related properties, such as their `dtype` or distributed topology, are coalesced into separate NCCL calls.
+- Flat communication and computation buffers are **contiguous-by-design**, supporting optimized CUDA kernels that require buffers backed by contiguous memory, such as grouped GEMMs used in MoE.
+
+Effectively, this implies that the same `DTensor`-sharded model parameters may have completely different shapes on different ranks, and if entire parameters are assigned to other ranks, the local `Tensor` will be empty.
+
+> ℹ️ Megatron-FSDP has a handy library ([`megatron_fsdp.uneven_dtensor`](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py)) for manipulating un-evenly sharded `DTensors`, focused on per-parameter operations like un-sharding or reducing parameters that have different shapes across ranks. While the parameter group is evenly-sharded for FSDP collectives, per-parameter collectives (that assume a symmetrical amount of bytes are communicated between devices) will hang waiting on bytes that will never arrive for un-evenly sharded `DTensors`.
+
+In particular, contiguous memory is only half the requirement for high-performance CUDA kernels. The other requirement is **locality**, which FSDP can violate, introducing compatibility issues when combining FSDP with present and future optimizations. For example, block-wise quantization (scaling factor / `absmax` calculations for MXFP8, NVFP4, etc.) requires DP communication and custom max-reduce kernels if the block is sharded by FSDP.
+
+Megatron-FSDP supports `dim=0` sharding, which computes the _**least-common multiple (LCM) of `p.shape[1:]` for all parameters `p` in an FSDP unit**_ and _**pads the un-sharded buffer to the closest multiple of `DP x LCM(p.shape[1:])`**_, forming a "DP-LCM" partition with `LCM`-length parts to ensure that DP-sharding boundaries do not violate chunks of data for coordinates of `dim=0`.
+
+```{figure} ../../images/megatron_fsdp/lcm_dim0_shard.png
+:alt: Flat Buffer Sharding Algorithm
+:align: center
+
+Visualization of how parameters are assigned un-evenly to the flat per-unit buffer sharded across DP ranks. With the LCM algorithm, no slice of `dim=0` is ever bisected by FSDP. Algorithms and compute kernels can leverage this locality and contiguity.
+```
+
+1. When a parameter is _divisible by the LCM_, it can be inserted at any free index that is a multiple of the LCM in the buffer. `p[i]` chunks of this parameter by definition divide the LCM, and thus align with the DP-LCM sharding grid.
+2. 
When a parameter _is larger than but not divisible by the LCM_, the remainder `r` populates a fraction of another LCM part, so a "conjugate" parameter that also exceeds the LCM with a "conjugate" remainder `r'` that is less than or equal to `LCM - r` is installed to fill the remaining space and align with the DP-LCM sharding grid. +3. When a parameter _is smaller than but not divisible by the LCM_, a post-assignment sweep on the leftover space in the flat buffer is run, and all gaps that are multiples of the LCM that are large enough to support the entire parameter are utilized. Once all gaps are filled, the final parameters are assigned to the tail of the buffer respecting the DP-LCM sharding grid. + +> ℹ️ Generalized support for contiguity and locality in Megatron-FSDP is a **_work-in-progress_** and will evolve with contribution from the OSS community and PyTorch. For more information about how kernel buffer requirements affect the design of FSDP data structures, refer to the [veScale: Consistent and Efficient Tensor Programming with Eager-Mode SPMD (Li, Youjie, ByteDance Seed, et al.)](https://arxiv.org/abs/2509.07003) paper that comprehensively analyzes these requirements. + +### Mixed-Precision & Quantization + +| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config | +|--------------|-------------|----------------------|----------------------| +| **Quantized Parameters** | Megatron-FSDP will shard and all-gather TransformerEngine-quantized parameters for computation. Quantized parameters are updated every optimization step, and both row-wise (FWD) and column-wise (BWD) data are managed for non-transposable 1-D quantization recipes like MXFP8. Otherwise, only activations are quantized. | `--fp8-param-gather` | TransformerEngine `quantized_model_init()` | +| **Main Parameter (Optimization / Checkpoint) Data-Type** | Data-type for optimization and checkpointing parameters. If set to `auto`, model compute weights are utilized instead. Required for `--fp8-param-gather`. Defaults to FP32. | `--megatron-fsdp-main-params-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(main_params_dtype=...)` | +| **Main Gradient (Accumulation) Data-Type** | Data-type for gradient accumulation. If set to `auto`, main gradient precision will be derived from model parameter precision. Defaults to `auto`. | `--megatron-fsdp-main-grads-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(main_grads_dtype=...)` | +| **Gradient Communication (Reduction) Data-Type** | Data-type for gradient communication and reduction. If set to `auto`, the main gradient precision will be used for communication. (When using NCCL symmetric registration, low-precision gradients are reduced in FP32 over-the-wire.) Defaults to `auto`. | `--megatron-fsdp-grad-comm-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(grad_comm_dtype=...)` | +| **Weight Gradient Accumulation Fusion** | When using TransformerEngine modules, Megatron-FSDP implements `get_main_grad` to allocate un-sharded gradient buffers called by TransformerEngine, to avoid `COPY`-ing the gradient to Megatron-FSDP communication buffers. Used by default and can be deactivated with `--no-gradient-accumulation-fusion`. | `--no-gradient-accumulation-fusion` | N/A (Megatron-Core Only) | +| **Precision-Aware Optimizer** | Use the TransformerEngine `FusedAdam` optimizer, and Megatron-FSDP will install the gradient in a temporary attribute `Parameter.decoupled_grad` which is consumed by `FusedAdam`. 
+### Mixed-Precision & Quantization
+
+| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config |
+|--------------|-------------|----------------------|----------------------|
+| **Quantized Parameters** | Megatron-FSDP will shard and all-gather TransformerEngine-quantized parameters for computation. Quantized parameters are updated every optimization step, and both row-wise (FWD) and column-wise (BWD) data are managed for non-transposable 1-D quantization recipes like MXFP8. Otherwise, only activations are quantized. | `--fp8-param-gather` | TransformerEngine `quantized_model_init()` |
+| **Main Parameter (Optimization / Checkpoint) Data-Type** | Data-type for optimization and checkpointing parameters. If set to `auto`, model compute weights are utilized instead. Required for `--fp8-param-gather`. Defaults to FP32. | `--megatron-fsdp-main-params-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(main_params_dtype=...)` |
+| **Main Gradient (Accumulation) Data-Type** | Data-type for gradient accumulation. If set to `auto`, main gradient precision will be derived from model parameter precision. Defaults to `auto`. | `--megatron-fsdp-main-grads-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(main_grads_dtype=...)` |
+| **Gradient Communication (Reduction) Data-Type** | Data-type for gradient communication and reduction. If set to `auto`, the main gradient precision will be used for communication. (When using NCCL symmetric registration, low-precision gradients are reduced in FP32 over-the-wire.) Defaults to `auto`. | `--megatron-fsdp-grad-comm-dtype {fp32, bf16, fp16, auto}` | `MixedPrecisionPolicy(grad_comm_dtype=...)` |
+| **Weight Gradient Accumulation Fusion** | When using TransformerEngine modules, Megatron-FSDP implements `get_main_grad`, called by TransformerEngine, to allocate un-sharded gradient buffers and avoid `COPY`-ing the gradient to Megatron-FSDP communication buffers. Used by default; can be deactivated with `--no-gradient-accumulation-fusion`. | `--no-gradient-accumulation-fusion` | N/A (Megatron-Core Only) |
+| **Precision-Aware Optimizer** | Use the TransformerEngine `FusedAdam` optimizer, and Megatron-FSDP will install the gradient in a temporary attribute `Parameter.decoupled_grad`, which is consumed by `FusedAdam`. Megatron-FSDP manages the main parameters, but the optimizer state precision can be customized with `--exp-avg-dtype` and `--exp-avg-sq-dtype`, which both support `fp8` optimization state. | `--use-precision-aware-optimizer` | `use_decoupled_grad=True` |
+
+#### Quantization
+
+Quantization is an extremely important feature for Megatron-FSDP, as it reduces memory utilization and communication size for both activations and parameters, which directly affects the viability and performance of FSDP.
+
+```{figure} ../../images/megatron_fsdp/quantized_param_gather.png
+:alt: Quantized Model Parameters & FSDP
+:align: center
+
+Visualization of Megatron-FSDP's training loop when using quantized weights from TransformerEngine. Every optimization step updates the quantized representation of sharded model weights, which have reduced communication size.
+```
+
+While TransformerEngine handles activation quantization, Megatron-FSDP shards quantized weights for all-gather (AG).
+
+0. _**Quantized Model Initialization**_ - Model is initialized with quantized weights, e.g. MXFP8 or NVFP4. If using `meta` device initialization, Megatron-FSDP will call `reset_parameters()` to initialize quantized weights layer-by-layer. If row-wise and column-wise data are not transposable, Megatron-FSDP will shard and buffer both. Additionally, high-precision main weights are retrieved and sharded for distributed optimization, checkpointing, and quantization.
+0. _**Forward / Backward Pass**_ - Quantized weights are un-sharded for both the forward and backward pass. If row-wise and column-wise data aren't transposable, the row-wise weights are gathered for forward, and the column-wise weights are gathered for backward.
+0. _**Distributed Optimization Step**_ - Non-quantized accumulated gradient shards from quantized GEMMs are applied to high-precision main weight shards.
+0. _**Sharded Quantization**_ - Sharded main weights are quantized to update the quantized compute weights for subsequent training steps.
+
+```{figure} ../../images/megatron_fsdp/sharded_quantization.png
+:alt: Sharded Quantization
+:align: center
+
+Sharded quantization involves reducing maxima to compute a global set of scaling factors for local / sharded quantization.
+```
+
+In particular, _sharded quantization_ minimizes communication size and memory utilization by communicating scaling factors instead of main weights.
+
+1. _**Local Abs-Max**_ - For a group of parameters in an FSDP unit, compute local tensor-wise or block-wise maxima across the global un-sharded shape, with zero padding for non-local data.
+1. _**Global Abs-Max**_ - Globally all-reduce maxima and derive scaling factors from maxima.
+1. _**Local Quantization**_ - Locally quantize sharded main weights and install into compute weight buffers.
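+Concretely, the three steps above can be sketched with PyTorch collectives. This is an illustration under assumptions (an initialized process group, a per-tensor scaling recipe, and FP8-E4M3 as the target format), not the Megatron-FSDP kernel path; the helper name is hypothetical.
+
+```python
+import torch
+import torch.distributed as dist
+
+def sharded_quantize(local_shard: torch.Tensor, group=None, qmax: float = 448.0):
+    # Each rank holds a (possibly empty) shard of the same logical weight.
+    # qmax=448.0 is the largest representable FP8-E4M3 value.
+    # (1) Local abs-max over the shard; empty shards contribute zero.
+    if local_shard.numel() > 0:
+        amax = local_shard.abs().amax().float()
+    else:
+        amax = torch.zeros((), device=local_shard.device)
+    # (2) Global abs-max via a max all-reduce, then derive the scaling factor.
+    dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=group)
+    scale = qmax / amax.clamp(min=1e-12)
+    # (3) Quantize only the local shard; the scale is globally consistent,
+    #     so no main weights are ever communicated.
+    q_shard = (local_shard.float() * scale).clamp(-qmax, qmax).to(torch.float8_e4m3fn)
+    return q_shard, scale
+```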
+#### Mixed-Precision
+
+Megatron-FSDP sharding and communication buffers support mixed-precision, such that users can customize the `dtype` used for main weights, gradient communication (reduction), and gradient accumulation in addition to the native or quantized `dtype` used for model computation. These options are wrapped in a `MixedPrecisionPolicy` dataclass.
+
+- _**Main Weight Precision**_ - Controls the data-type for parameters responsible for distributed optimization, distributed checkpointing, and quantization. If set to `auto` (`None`), the native model compute parameter data-type will be utilized. Required for parameter quantization with `--fp8-param-gather`. Defaults to `torch.float32`.
+- _**Main Gradient Precision**_ - Controls the data-type for `wgrad` accumulation and distributed optimization. Defaults to `auto` (`None`), in which case the model's native gradient data-type is used. While `torch.float32` (or higher) is recommended for accuracy at scale, `auto` is more flexible: because `main_grads_dtype` controls the data-type for gradient accumulation, `auto` defers to pre-determined parameter gradient logic in mixed-precision scenarios, such as `BF16` for `FP8`/`FP4` parameters quantized via TransformerEngine.
+- _**Gradient Communication Precision**_ - Controls the data-type for gradient communications when reducing gradients. Lower precision improves (communication) performance. Defaults to `auto` (`None`), in which case the main gradient data-type is used. If using `no_shard`, `optim`, HSDP, or HFSDP, allocating `dtype`-custom gradient communication buffers may increase per-unit memory overhead, so users should consider the performance-memory trade-off when using this feature.
+  - If using NCCL symmetric registration (`v2.27+`), gradient reduction may be performed in high precision depending on the network domain (NVLink or IB), which enables mixed-precision communication and accumulation: for example, setting `grad_comm_dtype` to `BF16` can support `FP32` reduction even though the input and output communication buffers are `BF16`. Otherwise, gradients are reduced and accumulated in the communication and accumulation precision as usual.
+
+### NCCL
+
+| Optimization | Description | `Megatron-Core` Config | `fully_shard` Config |
+|--------------|-------------|----------------------|----------------------|
+| **NCCL User Buffers** | Allocate and register Megatron-FSDP communication buffers with NCCL, which enables zero-`COPY`, high-precision reduction, copy-engine collectives, and symmetric kernels. Uses double buffering. | `--use-nccl-ub` | `nccl_ub=True` |
+| **NCCL Manual Registration** | Instead of registering NCCL user buffers on first allocation, batch registration of all communication buffers at the end of the initial training step. Reduces registration latency. | `--fsdp-manual-registration` | N/A (Megatron-Core Only) |
+| **Disable Symmetric Registration** | Disable symmetric registration with NCCL. Optional, as symmetric registration failure defaults to normal registration. | `--disable-symmetric-registration` | `disable_symmetric_registration=True` |
+
+[NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl) implements multi-device and multi-node communication primitives optimized for NVIDIA CUDA devices and networking. Megatron-FSDP communications are registered and deeply integrated with NCCL, which enables a variety of hardware-level networking optimizations such as copy-engine AG, high-precision RS, SHARP reduction offloading, and symmetric kernels.
+
+To leverage NCCL networking optimizations, **NCCL user buffer registration (UBR)** is required to inform NCCL of PyTorch Tensors ("user buffers") that act directly as the input and target of NCCL collectives for PyTorch `ProcessGroup`(s). Because registered communication buffers are known to NCCL, `COPY` operations that send collective inputs to NCCL buffers and collective outputs to PyTorch buffers are no longer required, which enables Megatron-FSDP to be zero-`COPY` end-to-end.
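+Putting the mixed-precision and NCCL buffer options together, a minimal `fully_shard` sketch might look like the following. The keyword names are taken from the tables above; the exact `fully_shard` signature, and the remaining required topology arguments (process groups, FSDP unit modules, sharding strategy), are assumptions and may differ by version.
+
+```python
+import torch
+from megatron_fsdp import fully_shard, MixedPrecisionPolicy  # import path assumed
+
+model = torch.nn.Linear(8192, 8192, bias=False)
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+
+mp_policy = MixedPrecisionPolicy(
+    main_params_dtype=torch.float32,   # optimization / checkpoint precision
+    main_grads_dtype=torch.float32,    # gradient accumulation precision
+    grad_comm_dtype=torch.bfloat16,    # gradient communication (reduction) precision
+)
+
+# Hypothetical call: topology arguments are omitted and keyword names assumed.
+model, optimizer = fully_shard(
+    model,
+    optimizer,
+    mixed_precision_policy=mp_policy,
+    nccl_ub=True,  # register communication buffers with NCCL (UBR)
+)
+```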
+
+NCCL (`v2.27+`) supports symmetric allocation or registration for communicators over the NVLink domain, which allows buffers that share identical virtual addresses across devices to benefit from optimized collectives:
+
+- **Symmetric Kernels** - On the NVLink domain, symmetric kernels operating on symmetric memory reduce the SM utilization of a single communication kernel to 1 SM.
+- **NVSwitch SHARP Offloading** - To further minimize SM utilization for AG and RS collectives, NCCL SHARP offloads reduction and aggregation work to NVLink and IB Switch hardware, using 1-6 SMs depending on the domain: NVL, IB, or NVL + IB.
+- **Copy-Engine (CE) Collectives** - Instead of using SMs (or CTAs) for common non-computational collectives like AG in Megatron-FSDP, copy engines perform the all-gather collectives, dedicating SM resources to compute and reduction during FSDP. Requires NCCL `v2.28+`.
+- **High-Precision Reduction** - When training large models, high-precision gradient reduction and accumulation are desired for accuracy and convergence, but communicating FP32 gradients is expensive. With symmetric registration, FP32 accumulators enable gradients to be reduced in FP32 but communicated in BF16, which decreases gradient RS communication latency while maintaining high accuracy during training. Megatron-FSDP supports FP32 main gradient accumulation with BF16 gradient communication, customizable through `megatron_fsdp.MixedPrecisionPolicy`.
+
+These optimizations significantly reduce SM resource contention for overlapped compute and communication kernels in FSDP. Symmetric registration, allocation, and pooling are also supported in PyTorch: [`torch.distributed._symmetric_memory`](https://docs.pytorch.org/docs/stable/symmetric_memory.html).
diff --git a/docs/user-guide/features/megatron_rl.md b/docs/user-guide/features/megatron_rl.md
new file mode 100644
index 00000000000..9cd46d79ae2
--- /dev/null
+++ b/docs/user-guide/features/megatron_rl.md
@@ -0,0 +1,55 @@
+
+
+# Megatron RL
+
+Reinforcement learning library for post-training large language models at scale.
+
+## Overview
+
+[**Megatron RL**](https://github.com/NVIDIA/Megatron-LM/tree/dev/megatron/rl) adds native reinforcement learning capabilities to Megatron-LM for large-scale RL-based post-training of foundation models.
+
+> **Note:** Megatron RL is under active development and primarily designed for research teams exploring RL post-training on modern NVIDIA hardware. For production deployments, use [**NeMo RL**](https://github.com/NVIDIA-NeMo/RL).
+ +## Key Features + +- **Decoupled Design** - Separates agent and environment logic from the core RL implementation +- **Inference Backends** - Megatron, OpenAI, and Hugging Face inference stacks +- **Trainer or Evaluator** - Manages rollout generation and coordinates with inference systems +- **Megatron Integration** - Native integration with Megatron Core inference system + +## Architecture + +### Components + +**Agents and Environments** +- Accept inference handles +- Return experience rollouts with rewards +- Implement custom RL logic + +**Trainer or Evaluator** +- Controls rollout generation +- Coordinates with inference systems +- Manages training loops + +**Inference Interface** +- Exposes a `.generate(prompt, **generation_args)` endpoint +- Supports multiple backends (Megatron, OpenAI, Hugging Face) + +## Use Cases + +- RLHF (Reinforcement Learning from Human Feedback) +- Custom reward-based fine-tuning +- Policy optimization for specific tasks +- Research on RL post-training techniques + +## Resources + +- **[Megatron RL GitHub](https://github.com/NVIDIA/Megatron-LM/tree/dev/megatron/rl)**: Source code and documentation +- **[Megatron Core Inference](../../api-guide/core/transformer.md)**: Native inference integration diff --git a/docs/user-guide/features/moe.md b/docs/user-guide/features/moe.md new file mode 100644 index 00000000000..45efe291a21 --- /dev/null +++ b/docs/user-guide/features/moe.md @@ -0,0 +1,22 @@ + + +# Mixture of Experts + +```{toctree} +:maxdepth: 1 +:caption: MoE Features + +multi_token_prediction +multi_latent_attention +../../api-guide/router_replay +``` + +```{include} ../../../megatron/core/transformer/moe/README.md +``` diff --git a/docs/user-guide/features/multi_latent_attention.md b/docs/user-guide/features/multi_latent_attention.md new file mode 100644 index 00000000000..65a1e573c6c --- /dev/null +++ b/docs/user-guide/features/multi_latent_attention.md @@ -0,0 +1,21 @@ + + +# Multi-Latent Attention + +## Multi-Latent Attention Overview + +Multi-Latent Attention (MLA) is an attention variant from the DeepSeek team. It uses multiple latent spaces to change how attention is computed. That design often lowers cost for large language models (LLMs) compared with standard attention and can shrink the KV cache. The DeepSeek-V2 technical report compares MLA to Multi-Head Attention (MHA) on quality and cache size. + +## Enabling Multi-Latent Attention + +To enable MLA in Megatron-LM, set the following on the command line: + +- `--multi-latent-attention` to turn on MLA. +- Use `MLATransformerConfig` for MLA-specific model settings when you build the training configuration. diff --git a/docs/user-guide/features/multi_token_prediction.md b/docs/user-guide/features/multi_token_prediction.md new file mode 100644 index 00000000000..e2d51c1b705 --- /dev/null +++ b/docs/user-guide/features/multi_token_prediction.md @@ -0,0 +1,58 @@ + + +# Multi-Token Prediction (MTP) + +Multi-Token Prediction (MTP) extends the prediction scope to several future tokens at each position. An MTP objective adds extra prediction targets, which can improve data efficiency. It may also encourage representations that anticipate later tokens. This implementation predicts additional tokens in sequence and preserves the causal dependency chain at each depth. The following figure illustrates MTP as used in [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3/). 
+
+![Diagram of Multi-Token Prediction depth stack: shared embedding, projection, transformer block, and output head per depth](../../images/multi_token_prediction/MTP_implementation.png)
+
+The *k*-th MTP module includes a shared embedding layer, a projection matrix, a Transformer block, and a shared output head. For the *i*-th input token at depth *k - 1*, the implementation combines the representation of the *i*-th token and the embedding of the *(i + k)*-th token with a linear projection. That combined representation is the input to the Transformer block at depth *k*, which produces the output representation.
+
+For more detail, refer to the [DeepSeek-V3 technical report](https://arxiv.org/pdf/2412.19437.pdf).
+
+## Related Arguments
+
+Train `GPTModel`-style models with MTP by setting `mtp_num_layers` to a positive integer.
+
+The following table summarizes MTP configuration fields:
+
+| Item | Description |
+| --- | --- |
+| `mtp_num_layers` | Number of MTP layers. MTP extends prediction to multiple future tokens at each position. This stack uses `mtp_num_layers` sequential modules to predict that many additional tokens per position. Default: `None`. |
+| `mtp_loss_scaling_factor` | Weight for the MTP loss term. The implementation averages MTP losses across depths, multiplies by this factor, and adds the result to the training objective. Default: `0.1`. |
+
+## Pipeline Parallel Layout for MTP
+
+MTP supports user-defined placement of MTP layers across pipeline stages through `pipeline_model_parallel_layout`. By default, all MTP layers sit on the last pipeline stage; you can override placement in the layout string.
+
+### MTP Standalone Mode
+
+When MTP layers are placed in a separate virtual pipeline (VPP) stage that is not on the last pipeline rank, the `mtp_standalone` flag is automatically set to `True`. MTP then runs in its own pipeline stage.
+
+### Layout Format
+
+Use `m` for MTP layers in the pipeline layout string. For example:
+- `"E|t*3|(t|)*5mL"` - MTP in the last stage
+- `"E|t*3|(t|)*4tm|L"` - MTP in the second-to-last stage with a decoder layer
+- `"E|t*3|(t|)*3tt|m|L"` - MTP in a standalone stage (second-to-last) with no other layers
+
+### Constraints
+
+- Place all MTP layers in the same virtual pipeline stage.
+- Do not place MTP layers on the first pipeline rank.
+
+## Implementation Notes
+
+- For models with MTP layers, the final LayerNorm sits in the stage that contains the last decoder layer, not in the post-process stage. That can change gradient norm reduction slightly in deterministic mode when LayerNorm would otherwise live in another stage. For bitwise alignment, disable gradient norm clipping.
+- MTP loss is computed in the post-processing stage.
+
+## Unsupported Combinations
+
+Context Parallel (CP), arbitrary `AttnMaskType`, and learned absolute position embeddings are not supported with MTP.
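+For reference, the following minimal sketch ties the configuration fields above together. The values are illustrative, and the exact home of the MTP fields may vary across Megatron-Core versions.
+
+```python
+from megatron.core.transformer.transformer_config import TransformerConfig
+
+# Illustrative values only; MTP field names follow the table above.
+config = TransformerConfig(
+    num_layers=61,
+    hidden_size=7168,
+    num_attention_heads=128,
+    mtp_num_layers=1,              # predict one additional future token per position
+    mtp_loss_scaling_factor=0.1,   # weight of the averaged MTP loss term
+)
+
+# Optionally control MTP placement with a custom layout string, e.g.:
+# pipeline_model_parallel_layout = "E|t*3|(t|)*5mL"  # MTP in the last stage
+```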
diff --git a/docs/user-guide/features/optimizer_cpu_offload.md b/docs/user-guide/features/optimizer_cpu_offload.md new file mode 100644 index 00000000000..1496bd0a91e --- /dev/null +++ b/docs/user-guide/features/optimizer_cpu_offload.md @@ -0,0 +1,13 @@ + + +# Optimizer CPU Offload + +```{include} ../../../megatron/core/optimizer/cpu_offloading/README.md +``` diff --git a/docs/user-guide/features/pipeline_parallel_layout.md b/docs/user-guide/features/pipeline_parallel_layout.md new file mode 100644 index 00000000000..69ffb63da5a --- /dev/null +++ b/docs/user-guide/features/pipeline_parallel_layout.md @@ -0,0 +1,37 @@ + + +# Custom Pipeline Model Parallel Layout + +*This is an experimental feature and may be changed.* + +`--pipeline-model-parallel-layout` takes a string that defines pipeline parallel partitioning. Use it to balance partitioning for an imbalanced model. For example, to partition a DeepSeek-V3-style stack (61 decoder layers and one MTP layer) with PP16 and VPP2, pass arguments similar to the following: + +```bash +--pipeline-model-parallel-size 16 +--pipeline-model-parallel-layout "Et*3|(tt|)*29,m|L" +``` + +The table below shows one possible rank map for that layout: + +| PP \ VPP rank | 0 | 1 | +|---------------|-------------------------|---------------| +| 0 | embedding + 3 × decoder | 2 × decoder | +| 1~13 | 2 × decoder | 2 × decoder | +| 14 | 2 × decoder | mtp | +| 15 | 2 × decoder | loss | + +In the layout string, stages are split by `|`. Replicated stages or layers use multiplication (for example, `t*3`). Commas are optional for readability. Symbols: + +* `E`: embedding layer +* `t`: transformer decoder layer +* `m`: MTP layer +* `L`: loss calculation layer + +**Note:** Empty stages are allowed, for example `E||t|L` (the second stage is empty). diff --git a/docs/user-guide/features/tokenizers.md b/docs/user-guide/features/tokenizers.md new file mode 100644 index 00000000000..1455d6e617e --- /dev/null +++ b/docs/user-guide/features/tokenizers.md @@ -0,0 +1,261 @@ + + +# Tokenizers + +Megatron Core provides a unified tokenizer system with a Hugging Face-style API for configuration and loading. 
+ +## Overview + +The `MegatronTokenizer` class uses the same entry points as many Hugging Face workflows for loading and managing tokenizers: + +- **Automatic detection** - Load tokenizer types without naming the backing library in code +- **Metadata-based configuration** - Store tokenizer settings in JSON for reuse across runs +- **Hugging Face-compatible API** - `.from_pretrained()`-style loading +- **Custom tokenizer support** - Extend with model-specific tokenization logic + +## Key Features + +### Unified API + +Use the same API regardless of tokenizer backend (SentencePiece, Hugging Face, TikToken, and so on): + +```python +from megatron.core.tokenizers import MegatronTokenizer + +tokenizer = MegatronTokenizer.from_pretrained("/path/to/tokenizer") +``` + +### Tokenizer Metadata + +Configuration is stored in a JSON metadata file containing: + +- Tokenizer library (Hugging Face, SentencePiece, TikToken, and so on) +- Chat templates +- Custom tokenizer class +- Special token configurations + +**Benefits** + +- Set configuration once, reuse everywhere +- No repeated CLI arguments +- Share setups by copying the tokenizer directory + +### Automatic Library Detection + +The correct tokenizer implementation is selected automatically: + +- Avoids hard-coding `SentencePieceTokenizer`, `HuggingFaceTokenizer`, and related class names in user code +- Library type is read from metadata +- Change tokenizer backends by updating metadata and paths + +## Basic Usage + +### Creating Tokenizer Metadata + +Save tokenizer configuration for reuse: + +```python +from megatron.core.tokenizers import MegatronTokenizer + +# Create metadata for a SentencePiece tokenizer +MegatronTokenizer.write_metadata( + tokenizer_path="/path/to/tokenizer.model", + tokenizer_library="sentencepiece", + chat_template="{% for message in messages %}{{ message.content }}{% endfor %}", +) +``` + +The metadata is saved as `tokenizer_metadata.json` in the tokenizer directory. 
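+The exact contents of the file depend on the arguments passed to `write_metadata()`. As a quick sanity check, you can load it back as ordinary JSON; the keys shown below are assumptions based on the fields described above:
+
+```python
+import json
+
+# Inspect the generated metadata next to the tokenizer model.
+with open("/path/to/tokenizer_metadata.json") as f:
+    metadata = json.load(f)
+
+print(metadata["library"])            # e.g. "sentencepiece"
+print(metadata.get("chat_template"))  # the Jinja template saved above
+```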
+ +### Loading a Tokenizer + +Load from a directory with metadata: + +```python +from megatron.core.tokenizers import MegatronTokenizer + +# Load with auto-detected configuration +tokenizer = MegatronTokenizer.from_pretrained("/path/to/tokenizer.model") +``` + +### Loading with Custom Metadata Path + +If metadata is stored separately: + +```python +tokenizer = MegatronTokenizer.from_pretrained( + tokenizer_path="/path/to/tokenizer.model", + metadata_path="/path/to/custom/metadata.json", +) +``` + +### Loading with Inline Metadata + +Pass metadata as a dictionary: + +```python +tokenizer = MegatronTokenizer.from_pretrained( + tokenizer_path="GPT2BPETokenizer", + metadata_path={"library": "megatron"}, + vocab_file="/path/to/vocab.txt", +) +``` + +## Advanced Usage + +### Custom Tokenizer Classes + +Create model-specific tokenization logic: + +```python +from megatron.core.tokenizers.text import MegatronTokenizerText + +class CustomTokenizer(MegatronTokenizerText): + def encode(self, text): + # Custom encoding logic + return super().encode(text) + + def decode(self, tokens): + # Custom decoding logic + return super().decode(tokens) + +# Save metadata with custom class +MegatronTokenizer.write_metadata( + tokenizer_path="/path/to/tokenizer.model", + tokenizer_library="sentencepiece", + tokenizer_class=CustomTokenizer, +) +``` + +### TikToken Tokenizers + +Configure TikToken-based tokenizers: + +```python +tokenizer = MegatronTokenizer.from_pretrained( + tokenizer_path="/path/to/tokenizer/model.json", + metadata_path={"library": "tiktoken"}, + pattern="v2", + num_special_tokens=1000, +) +``` + +### Null Tokenizer + +The Null tokenizer is a lightweight, zero-I/O tokenizer that requires no model files. +It is useful in three scenarios: + +1. **Performance benchmarking** with `--mock-data` where real tokenization is unnecessary. +2. **Testing** in functional tests and CI pipelines where tokenizer model files may not + be available. The Null tokenizer removes the dependency on external files, making + tests self-contained and portable. +3. **Pretraining with pretokenized data** where all data is already tokenized into + `.bin`/`.idx` files. In this case the tokenizer is only needed for metadata + (`vocab_size`, `eod`, `pad`) — not for actual tokenization. Using the Null tokenizer + avoids redundant filesystem access at scale, which is particularly beneficial on + shared filesystems like Lustre where thousands of ranks would otherwise all load the + same tokenizer files. + +Properties derived from `--vocab-size N`: +- `vocab_size` = `N` (the exact value passed) +- `eod` = `N - 1` (last token in the vocabulary) +- `pad` = `0` + +```python +tokenizer = MegatronTokenizer.from_pretrained( + metadata_path={"library": "null-text"}, + vocab_size=131072, +) +``` + +## Integration with Megatron-LM + +### Using with Training Scripts + +The tokenizer system works with Megatron-LM training scripts: + +```bash +# Null tokenizer for benchmarking with mock data +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tokenizer-type NullTokenizer \ + --vocab-size 131072 \ + --mock-data \ + ... +``` + +```bash +# Null tokenizer for pretraining with pretokenized data (no tokenizer files needed) +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tokenizer-type NullTokenizer \ + --vocab-size 128256 \ + --data-path /path/to/pretokenized_data \ + ... 
+``` + +```bash +# Hugging Face tokenizer with metadata +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model meta-llama/Meta-Llama-3-8B \ + --tokenizer-metadata /path/to/metadata.json \ + ... +``` + +### Auto-Generated Metadata + +If `--tokenizer-metadata` is not specified, a default metadata file is generated automatically based on the tokenizer type. + +## Supported Tokenizer Libraries + +The following table lists supported tokenizer backends: + +| Library | Description | Use Case | +|---------|-------------|----------| +| **Hugging Face** | Transformers tokenizers | Most modern LLMs, such as LLaMA and Mistral | +| **SentencePiece** | Google's tokenizer | GPT-style models, custom vocabularies | +| **TikToken** | OpenAI's tokenizer | GPT-3.5/GPT-4 style tokenization | +| **Megatron** | Built-in tokenizers | Legacy GPT-2 BPE | +| **Null** | Zero-I/O tokenizer | Benchmarking, pretokenized data | + +## Common Tokenizer Types + +### LLaMA / Mistral + +```python +MegatronTokenizer.write_metadata( + tokenizer_path="/path/to/llama/tokenizer.model", + tokenizer_library="sentencepiece", +) +``` + +### GPT-2 + +```python +MegatronTokenizer.write_metadata( + tokenizer_path="GPT2BPETokenizer", + tokenizer_library="megatron", + vocab_file="/path/to/gpt2-vocab.json", + merge_file="/path/to/gpt2-merges.txt", +) +``` + +## Recommendations + +1. **Save metadata** - Create metadata once, then reuse across training runs +2. **Prefer Hugging Face tokenizers** - When the model ships one, it reduces integration work +3. **Test tokenization** - Verify encode and decode before long training jobs +4. **Version control metadata** - Track `tokenizer_metadata.json` with experiment configs +5. **Share tokenizer directories** - Ship model files and metadata together for reproducibility + +## Next Steps + +- **Prepare data**: Refer to [Data Preparation](../data-preparation.md) for preprocessing with tokenizers +- **Train models**: Refer to [Training Examples](../training-examples.md) +- **Supported models**: Refer to [Language Models](../../models/llms.md) for model-specific tokenizers diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md new file mode 100644 index 00000000000..2a7ee2eeab9 --- /dev/null +++ b/docs/user-guide/index.md @@ -0,0 +1,26 @@ +--- +orphan: true +--- + + + +# User Guide + +Guides for using Megatron Core and Megatron-LM. + +```{toctree} +:maxdepth: 2 + +msc_integration +data-preparation +training-examples +parallelism-guide +features/index +``` diff --git a/docs/user-guide/msc_integration.md b/docs/user-guide/msc_integration.md new file mode 100644 index 00000000000..a197f25afc1 --- /dev/null +++ b/docs/user-guide/msc_integration.md @@ -0,0 +1,12 @@ + + +```{include} ../../megatron/core/MSC_Integration.md +``` + diff --git a/docs/user-guide/parallelism-guide.md b/docs/user-guide/parallelism-guide.md new file mode 100644 index 00000000000..2540ca0a827 --- /dev/null +++ b/docs/user-guide/parallelism-guide.md @@ -0,0 +1,250 @@ + + +# Parallelism Strategies Guide + +Megatron Core supports multiple parallelism strategies that can be combined to efficiently train models from billions to trillions of parameters across thousands of GPUs. + +## Overview + +The following table summarizes supported parallelism strategies. 
+ +| Strategy | Parallelism Objective | Best For | +|----------|---------------------|----------| +| **Data Parallelism (DP)** | Batch Dimension | Data Scalability, Standard Training | +| **Tensor Parallelism (TP)** | Individual Layers | Large Layers & Activation, GPU Memory Constraints | +| **Pipeline Parallelism (PP)** | Model Depth | Very Deep Models | +| **Context Parallelism (CP)** | Sequence Length | Long Sequences (8K+ Tokens) | +| **Expert Parallelism (EP)** | MoE Experts | Mixture-of-Experts Models | +| **Fully-Sharded Data Parallelism (Megatron-FSDP)** | Model State | Extremely Large Models & DP Interchangeability | + +## Data Parallelism (DP) + +### Standard Distributed Data Parallel (DDP) + +Replicate the model across GPUs and split the batch. + +```bash +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --data-parallel-sharding-strategy no_shard +``` + +Each GPU has a full copy of the model and processes a portion of the batch. + +### Megatron Fully-Sharded Data Parallel (Megatron-FSDP) + +Shard model parameters, gradients, and optimizer states across GPUs to reduce memory utilization. + +``` +--use-megatron-fsdp +--data-parallel-sharding-strategy optim_grads_params +--ckpt-format fsdp_dtensor +--init-model-with-meta-device +``` + +**Sharding Strategies** + +`--data-parallel-sharding-strategy` supports the following options: + +- `optim` - Shard optimizer states only (ZeRO-1) +- `optim_grads` - Shard gradients + optimizer (ZeRO-2) +- `optim_grads_params` - Shard parameters + gradients + optimizer (ZeRO-3) + +If `--num-distributed-optimizer-instances` is > 1, then hierarchical data parallelism is enabled. + +`--outer-dp-sharding-strategy` supports the following options: + +- `no_shard` (**Hybrid-Sharded Data Parallelism**) - Replicate the model state across outer data parallel ranks. +- `optim` (**Hybrid-FSDP**) - Shard the optimizer state across the outer data parallel ranks. + - Requires `--data-parallel-sharding-strategy optim_grads_params`. + +**When to Use** + +- Large models with large or fused compute kernels to hide communications under. +- Integrated with TP, CP, EP, and easily composable with heterogeneous parallelisms. +- With SM-reducing optimizations from NCCL and activation offloading from TransformerEngine. +- Using `fully_shard` without depending on Megatron-LM. + +## Tensor Parallelism (TP) + +Split individual model layers across GPUs. Recommended for large hidden dimensions. + +```bash +--tensor-model-parallel-size 4 # 4-way tensor parallelism +--sequence-parallel # Enable sequence parallelism (recommended) +``` + +**When to Use** + +- Model layers do not fit on a single GPU +- Large hidden dimensions (4096+) +- Usually combined with DP and PP + +## Pipeline Parallelism (PP) + +Split model layers across GPUs vertically (by depth). + +```bash +--pipeline-model-parallel-size 8 # 8 pipeline stages +--num-layers-per-virtual-pipeline-stage 4 # Virtual pipeline for load balancing +``` + +**When to Use** + +- Very deep models (50+ layers) +- Combine with TP for large models +- Helps distribute memory across GPUs + +## Context Parallelism (CP) + +Split long sequences across GPUs for efficient long-context training. + +```bash +--context-parallel-size 2 # 2-way context parallelism +--cp-comm-type p2p # Communication type +``` + +**When to Use** + +- Long sequences (8K+ tokens) +- Reduces activation memory +- Can combine with TP, PP, DP + +Refer to [Context Parallelism Deep Dive](features/context_parallel.md) for a detailed guide with performance analysis. 
+ +## Expert Parallelism (EP) + +Distribute experts across GPUs in Mixture-of-Experts models. + +```bash +--expert-model-parallel-size 8 # 8-way expert parallelism +--num-experts 64 # 64 experts per MoE layer +--moe-grouped-gemm # Optimize expert computation +``` + +**Important:** When combining EP with TP, you **must enable Sequence Parallelism**: + +```bash +--tensor-model-parallel-size 4 +--expert-model-parallel-size 8 +--sequence-parallel # Required when using TP + EP +``` + +## Parallelism Selection Guide + +For a list of supported configurations, refer to [Megatron Bridge Supported Models](https://github.com/NVIDIA-NeMo/Megatron-Bridge#supported-models). + +### Language Models + +Recommended language model configurations: + +| Model | Size | GPUs | TP | PP | CP | EP | Configuration Notes | +|-------|------|------|----|----|----|----|---------------------| +| **LLaMA-3** | 8B | 8 | 1 | 1 | 2 | 1 | CP=2 for long context (8K seqlen) | +| **LLaMA-3** | 70B | 64 | 4 | 4 | 2 | 1 | Balanced TP+PP for 70B scale | +| **LLaMA-3.1** | 405B | 1024 | 8 | 8 | 2 | 1 | 3D parallelism (TP+PP+CP) | +| **GPT-3** | 175B | 128-512 | 4 | 8 | 1 | 1 | Standard large model config | + +### Mixture-of-Experts Models + +Recommended mixture-of-experts configurations: + +| Model | Size | GPUs | TP | PP | CP | EP | Configuration Notes | +|-------|------|------|----|----|----|----|---------------------| +| **Mixtral** | 8x7B | 64 | 1 | 4 | 1 | 8 | EP=8 for 8 experts | +| **Mixtral** | 8x22B | 256 | 4 | 4 | 1 | 8 | TP+PP+EP for large MoE | +| **DeepSeek-V3** | 671B | 1024 | 2 | 16 | 1 | 64 | Massive MoE with 256 experts | + +## Combining Strategies + +### Total GPU Count + +The total number of GPUs is calculated as: + +``` +Total GPUs = TP × PP × CP × EP × DP +``` + +### Example: LLaMA-3 70B on 64 GPUs + +```bash +# TP=4, PP=4, CP=2, DP=2 => 4 × 4 × 2 × 2 = 64 GPUs +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 4 \ + --context-parallel-size 2 \ + --num-layers 80 \ + --hidden-size 8192 \ + --num-attention-heads 64 \ + --seq-length 8192 \ + --micro-batch-size 1 \ + --global-batch-size 512 \ + --bf16 +``` + +## Performance Optimizations + +### Communication Overlap + +Enable overlapping of communication with computation: + +```bash +--overlap-grad-reduce # Overlap gradient reduction with backward pass +--overlap-param-gather # Overlap parameter gathering with forward pass +--tp-comm-overlap # Overlap TP communication +``` + +### Distributed Optimizer + +Recommended for all multi-GPU training: + +```bash +--use-distributed-optimizer +``` + +**Benefits** + +- Faster checkpointing +- Reduced memory when combined with FSDP +- Better performance at scale + +### Sequence Parallelism + +Always enable when using TP: + +```bash +--sequence-parallel +``` + +Reduces activation memory by sharding sequence dimension in LayerNorm and Dropout. + +## Choosing the Right Strategy + +### Start Simple +1. Begin with **Data Parallelism** (DP) only. +2. Add **Tensor Parallelism** (TP) if the model does not fit. +3. Add **Pipeline Parallelism** (PP) for very large models. +4. Add **Context Parallelism** (CP) for long sequences. + +### Memory Constraints +- Use **FSDP** to split model state per GPU. +- Use **TP** to split large layers. +- Use **PP** to split model depth. +- Enable **activation checkpointing or offloading** for extreme cases. + +### Communication Bottlenecks +- Reduce **TP** degree (increases memory per GPU). +- Increase **PP** degree (may reduce efficiency). 
+- Use **CP** instead of larger TP for long sequences. + +## Next Steps + +- **API Reference**: Refer to [Tensor Parallel](../api-guide/core/tensor_parallel.md) and [Pipeline Parallel](../api-guide/core/pipeline_parallel.md) in the API documentation +- **Advanced Features**: Refer to [Megatron-FSDP](features/megatron_fsdp.md), [MoE](features/moe.md), and [Distributed Optimizer](features/dist_optimizer.md) +- **Performance Tuning**: Refer to the [NVIDIA NeMo Performance Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html) diff --git a/docs/user-guide/training-examples.md b/docs/user-guide/training-examples.md new file mode 100644 index 00000000000..ca8b182adc7 --- /dev/null +++ b/docs/user-guide/training-examples.md @@ -0,0 +1,159 @@ + + +# Training Examples + +Get started with Megatron Core training using these practical examples. + +## Basic Training Example + +Use the basic training loop with mock data to get started: + +```bash +# Distributed training on 2 GPUs with mock data +torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py +``` + +This example: + +- Runs on two GPUs +- Uses generated mock data (no data preparation needed) +- Demonstrates basic distributed training setup +- Provides a quick way to verify your installation + +## LLaMA-3 Training Examples + +### LLaMA-3 8B with FP8 + +Train the LLaMA-3 8B model with FP8 mixed precision on eight GPUs: + +```bash +./examples/llama/train_llama3_8b_h100_fp8.sh +``` + +**Configuration** + +- Eight GPUs +- FP8 mixed precision (requires Hopper/Ada/Blackwell GPUs) +- Mock data for quick testing + +### Custom LLaMA Training + +For training with your own data: + +```bash +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 4 \ + --global-batch-size 32 \ + --train-iters 100000 \ + --lr 3.0e-4 \ + --min-lr 3.0e-5 \ + --lr-decay-style cosine \ + --lr-warmup-iters 2000 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --bf16 \ + --data-path /path/to/your/preprocessed_data \ + --split 949,50,1 \ + --save /path/to/checkpoints \ + --load /path/to/checkpoints \ + --log-interval 10 \ + --save-interval 1000 \ + --eval-interval 1000 +``` + +## GPT-3 Training Example + +Train a GPT-3 style model: + +```bash +torchrun --nproc_per_node=8 pretrain_gpt.py \ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 2 \ + --num-layers 24 \ + --hidden-size 2048 \ + --num-attention-heads 16 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --micro-batch-size 2 \ + --global-batch-size 16 \ + --train-iters 100000 \ + --lr 1.5e-4 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --lr-warmup-iters 1000 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --fp16 \ + --data-path /path/to/preprocessed_data \ + --split 949,50,1 \ + --save /path/to/checkpoints \ + --load /path/to/checkpoints +``` + +## Key Training Arguments + +The following tables group common training arguments by category. 
+ +### Model Architecture + +| Argument | Description | +|----------|-------------| +| `--num-layers` | Number of transformer layers | +| `--hidden-size` | Hidden dimension size | +| `--num-attention-heads` | Number of attention heads | +| `--seq-length` | Sequence length for training | + +### Training Configuration + +| Argument | Description | +|----------|-------------| +| `--micro-batch-size` | Batch size per GPU | +| `--global-batch-size` | Total batch size across all GPUs | +| `--train-iters` | Number of training iterations | + +### Learning Rate + +| Argument | Description | +|----------|-------------| +| `--lr` | Peak learning rate | +| `--min-lr` | Minimum learning rate | +| `--lr-decay-style` | LR schedule (cosine, linear, constant) | +| `--lr-warmup-iters` | Warmup iterations | + +### Mixed Precision + +| Argument | Description | +|----------|-------------| +| `--fp16` | FP16 mixed precision | +| `--bf16` | BF16 mixed precision (recommended) | +| `--fp8-hybrid` | FP8 mixed precision (Hopper/Ada/Blackwell) | + +### Data and Checkpointing + +| Argument | Description | +|----------|-------------| +| `--data-path` | Path to preprocessed data | +| `--split` | Train/validation/test split (for example, 949,50,1) | +| `--save` | Checkpoint save directory | +| `--load` | Checkpoint load directory | +| `--save-interval` | Save checkpoint every N iterations | + +## Next Steps + +- **Optimize Performance**: Refer to [Advanced Features](features/index.md) for FSDP, the distributed optimizer, and other optimizations +- **Scale Up**: Refer to [Parallelism Strategies](parallelism-guide.md) to train larger models across more GPUs +- **Prepare Data**: Follow the [Data Preparation](data-preparation.md) guide to process your own datasets diff --git a/docs/versions1.json b/docs/versions1.json new file mode 100644 index 00000000000..b0bc489fa71 --- /dev/null +++ b/docs/versions1.json @@ -0,0 +1,22 @@ +[ + { + "name": "nightly", + "version": "nightly", + "url": "https://docs.nvidia.com/megatron-core/developer-guide/nightly/" + }, + { + "name": "0.17.0 (latest)", + "version": "0.17.0", + "url": "https://docs.nvidia.com/megatron-core/developer-guide/latest/" + }, + { + "name": "0.16.0", + "version": "0.16.0", + "url": "https://docs.nvidia.com/megatron-core/developer-guide/0.16.0/" + }, + { + "name": "0.15.0", + "version": "0.15.0", + "url": "https://docs.nvidia.com/megatron-core/developer-guide/0.15.0/" + } +] diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 00000000000..0519ecba6ea --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md similarity index 100% rename from examples/detxoify_lm/README.md rename to examples/academic_paper_scripts/detxoify_lm/README.md diff --git a/examples/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py similarity index 100% rename from examples/detxoify_lm/annotations/filter-selfgeneration.py rename to examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py diff --git a/examples/detxoify_lm/annotations/perspective_api_annotate.py b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py similarity index 98% rename from examples/detxoify_lm/annotations/perspective_api_annotate.py rename to examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py index 
fd82c2a2ae9..9736db099a4 100644
--- a/examples/detxoify_lm/annotations/perspective_api_annotate.py
+++ b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py
@@ -107,7 +107,7 @@ def get_score(line):
         except UnicodeDecodeError:
             try:
                 decoded_text = encoded_text[:20476].decode('utf8')
-            except:
+            except Exception:
                 print("Error occurred")
                 data['score'] = None
                 return json.dumps(data)
@@ -138,7 +138,7 @@ def get_scores(lines):
         except UnicodeDecodeError:
             try:
                 decoded_text = encoded_text[:20476].decode('utf8')
-            except:
+            except Exception:
                 print("Error occurred")
                 data['score'] = None
                 all_data.append(json.dumps(data))
diff --git a/examples/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh
similarity index 100%
rename from examples/detxoify_lm/annotations/preprocess.sh
rename to examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh
diff --git a/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh
similarity index 98%
rename from examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh
rename to examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh
index 62a36c0b79e..a212fbdf3f6 100755
--- a/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh
+++ b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh
@@ -43,7 +43,6 @@ python -m torch.distributed.run $DISTRIBUTED_ARGS \
        --data-path2 ${DATA_BLEND} \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
-       --data-impl mmap \
        --split 100,0,0 \
        --distributed-backend nccl \
        --lr-decay-style constant \
diff --git a/examples/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh
similarity index 100%
rename from examples/detxoify_lm/generate-1.3b.sh
rename to examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh
diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
new file mode 100644
index 00000000000..2a2b1d63a21
--- /dev/null
+++ b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
@@ -0,0 +1,249 @@
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+
+
+"""Sample Generate GPT"""
+import json
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir, os.path.pardir)))
+import torch
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.training import print_rank_0
+from megatron.training.checkpointing import load_checkpoint
+from megatron.core import mpu
+from megatron.training.arguments import parse_and_validate_args
+from megatron.training.initialize import initialize_megatron
+from megatron.training import get_model
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.core.models.gpt import GPTModel
+from typing import Union
+from megatron.core.transformer.spec_utils import import_module
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec
+
+def model_provider(pre_process=True, post_process=True) -> GPTModel:
+    """Builds the model.
+
+    Args:
+        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to true if you want to compute output logits/loss. Defaults to True.
+
+
+    Returns:
+        GPTModel: The returned model
+    """
+    args = get_args()
+
+    print_rank_0('building GPT model ...')
+    config = core_transformer_config_from_args(args)
+
+    if args.spec is None:
+        if args.transformer_impl == 'local':
+            transformer_layer_spec = get_gpt_layer_local_spec(
+                num_experts=args.num_experts,
+                moe_grouped_gemm=args.moe_grouped_gemm
+            )
+        elif args.transformer_impl == 'transformer_engine':
+            transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+                num_experts=args.num_experts,
+                moe_grouped_gemm=args.moe_grouped_gemm
+            )
+        else:
+            raise ValueError(f"Invalid transformer_impl {args.transformer_impl}")
+    elif args.spec[0] == 'local':
+        transformer_layer_spec = get_gpt_layer_local_spec(
+            num_experts=args.num_experts,
+            moe_grouped_gemm=args.moe_grouped_gemm
+        )
+    else:
+        transformer_layer_spec = import_module(args.spec)
+
+    model = GPTModel(
+        config=config,
+        transformer_layer_spec=transformer_layer_spec,
+        vocab_size=args.padded_vocab_size,
+        max_sequence_length=args.max_position_embeddings,
+        pre_process=pre_process,
+        post_process=post_process,
+        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+        parallel_output=False,
+        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+        position_embedding_type=args.position_embedding_type,
+        rotary_percent=args.rotary_percent
+    )
+
+    return model
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0,
+                       help='Sampling temperature.')
+    group.add_argument("--greedy", action='store_true', default=False,
+                       help='Use greedy sampling.')
+    group.add_argument("--top_p", type=float, default=0.0,
+                       help='Top p sampling.')
+    group.add_argument("--top_k", type=int, default=0,
+                       help='Top k sampling.')
+    group.add_argument("--out-seq-length", type=int, default=1024,
+                       help='Size of the output generated text.')
+    group.add_argument("--sample-input-file", type=str, default=None,
+                       help='Get input from file instead of interactive mode, '
+                       'each line is an input.')
+    group.add_argument("--sample-output-file", type=str, default=None,
+                       help='Output file generated from --sample-input-file')
+    group.add_argument("--num-samples", type=int, default=0,
+                       help='Number of samples to generate unconditionally, '
+                       'defaults to 0 (interactive conditional sampling)')
+    group.add_argument("--genfile", type=str,
+                       help='Output file when generating unconditionally')
+    return parser
+
+def generate_samples_unconditional(model):
+    args = get_args()
+
+    if torch.distributed.get_rank() == 0:
+        cnt = 0
+        num_samples = args.num_samples
+        from tqdm import tqdm
+        pbar = tqdm(total=num_samples)
+
+    while True:
+        if torch.distributed.get_rank() == 0:
+            sentences = [''] * args.global_batch_size
+            print("global batch size", args.global_batch_size)
+            max_len = args.out_seq_length
+            resp_sentences, resp_sentences_seg, output_logits, \
+                tokens = generate_and_post_process(model, prompts=sentences,
+                                                   tokens_to_generate=max_len,
+                                                   return_output_log_probs=False,
+                                                   top_k_sampling=args.top_k,
+                                                   top_p_sampling=args.top_p,
+                                                   add_BOS=True,
+                                                   temperature=1.0)
+            for prompt, generation, token in zip(sentences, resp_sentences, tokens):
+                datum = {'text': generation[len(prompt):],
'all_text': generation, 'prompt': prompt, 'id': cnt} + yield datum + cnt += 1 + pbar.update() + if cnt >= num_samples: + break + + if cnt >= num_samples: + pbar.close() + break + else: + generate_and_post_process(model) + + +def generate_samples_conditional(model): + args = get_args() + + if torch.distributed.get_rank() == 0: + num_samples = args.num_samples + cnt = 0 + from tqdm import tqdm + pbar = tqdm(total=num_samples) + + fname = open(args.sample_input_file, "r") + lines = fname.readlines() + all_raw_text = [json.loads(line)['prompt']['text'] for line in lines] + input_count = len(all_raw_text) + input_pos = 0 + + while True: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + sentences = [] + print("global batch size", args.global_batch_size) + for _ in range(args.global_batch_size): + if input_pos >= input_count: + print(f"input pos: {input_pos}, input count: {input_count}") + raw_text = "EMPTY TEXT" + else: + raw_text = all_raw_text[input_pos] + input_pos += 1 + sentences.append(raw_text) + + max_len = args.out_seq_length + resp_sentences, resp_sentences_seg, output_logits, \ + tokens = generate_and_post_process(model, prompts=sentences, + tokens_to_generate=max_len, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=1.0) + for prompt, generation, token in zip(sentences, resp_sentences, tokens): + datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} + yield datum + cnt += 1 + pbar.update() + if cnt >= num_samples: + break + + if cnt >= num_samples: + pbar.close() + break + else: + generate_and_post_process(model) + + +def generate_and_write_samples_unconditional(model): + args = get_args() + assert args.genfile is not None + with open(args.genfile, 'w') as f: + for datum in generate_samples_unconditional(model): + if torch.distributed.get_rank() == 0: + f.write(json.dumps(datum) + '\n') + + +def generate_and_write_samples_conditional(model): + args = get_args() + if args.sample_output_file is None: + sample_output_file = args.sample_input_file + ".out" + print('`sample-output-file` not specified, setting ' + 'it to {}'.format(sample_output_file)) + else: + sample_output_file = args.sample_output_file + with open(sample_output_file, 'w') as f: + for datum in generate_samples_conditional(model): + if torch.distributed.get_rank() == 0: + f.write(json.dumps(datum) + '\n') + + +def main(): + """Main program.""" + + parse_and_validate_args(extra_args_provider=add_text_generate_args, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True, + 'seq_length': 2048}) + initialize_megatron() + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + args = get_args() + + if args.load is not None: + _ = load_checkpoint(model, None, None) + model = model[0] + + # Generate samples. 
+    if args.sample_input_file is not None:
+        print(args.sample_input_file)
+        generate_and_write_samples_conditional(model)
+    else:
+        generate_and_write_samples_unconditional(model)
+
+
+if __name__ == "__main__":
+
+    main()
diff --git a/examples/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py
similarity index 100%
rename from examples/detxoify_lm/perspective_api.py
rename to examples/academic_paper_scripts/detxoify_lm/perspective_api.py
diff --git a/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh
similarity index 100%
rename from examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh
rename to examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh
diff --git a/examples/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md
similarity index 100%
rename from examples/msdp/README.md
rename to examples/academic_paper_scripts/msdp/README.md
diff --git a/examples/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh
similarity index 100%
rename from examples/msdp/data_processing.sh
rename to examples/academic_paper_scripts/msdp/data_processing.sh
diff --git a/examples/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh
similarity index 100%
rename from examples/msdp/eval_knwl_generation.sh
rename to examples/academic_paper_scripts/msdp/eval_knwl_generation.sh
diff --git a/examples/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh
similarity index 100%
rename from examples/msdp/eval_resp_generation.sh
rename to examples/academic_paper_scripts/msdp/eval_resp_generation.sh
diff --git a/examples/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh
similarity index 100%
rename from examples/msdp/prep_resp_gen.sh
rename to examples/academic_paper_scripts/msdp/prep_resp_gen.sh
diff --git a/examples/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh
similarity index 100%
rename from examples/msdp/prompt_knwl_gen.sh
rename to examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh
diff --git a/examples/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh
similarity index 100%
rename from examples/msdp/prompt_resp_gen.sh
rename to examples/academic_paper_scripts/msdp/prompt_resp_gen.sh
diff --git a/examples/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh
similarity index 100%
rename from examples/sc21/CONFIG.sh
rename to examples/academic_paper_scripts/sc21/CONFIG.sh
diff --git a/examples/academic_paper_scripts/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md
new file mode 100644
index 00000000000..ec922d153d6
--- /dev/null
+++ b/examples/academic_paper_scripts/sc21/README.md
@@ -0,0 +1,50 @@
+# Reproducing Figures in SC21 Paper
+
+
+This directory contains some of the scripts that were used to produce the
+results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
+to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
+scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
+[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
+schedulers as well.
+ + +## Git commit + +To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e + + +## Setup + +All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please +update the unspecified values (in angle brackets `<...>`) before launching any +scripts. + + + +## Scripts + +Below is a list of scripts that can be used to reproduce various figures in our +[paper](https://arxiv.org/pdf/2104.04473.pdf): + +* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput +for GPT models ranging from 1 billion to 1 trillion parameters. +* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling +performance of pipeline parallelism. +* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of +the interleaved schedule on a 175B GPT model. +* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of +different degrees of pipeline and tensor model parallelism on a model with +162.2 billion parameters. +* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of +different degrees of data and pipeline model parallelism on a model with +5.9 billion parameters. +* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of +different degrees of data and tensor model parallelism on a model with +5.9 billion parameters. +* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of +microbatch size. +* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of +activation recomputation. +* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of +the scatter-gather communication optimization. diff --git a/examples/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh similarity index 100% rename from examples/sc21/SBATCH.sh rename to examples/academic_paper_scripts/sc21/SBATCH.sh diff --git a/examples/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh similarity index 100% rename from examples/sc21/SRUN.sh rename to examples/academic_paper_scripts/sc21/SRUN.sh diff --git a/examples/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh similarity index 100% rename from examples/sc21/run_figure_11.sh rename to examples/academic_paper_scripts/sc21/run_figure_11.sh diff --git a/examples/sc21/run_figure_12.sh b/examples/academic_paper_scripts/sc21/run_figure_12.sh similarity index 100% rename from examples/sc21/run_figure_12.sh rename to examples/academic_paper_scripts/sc21/run_figure_12.sh diff --git a/examples/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh similarity index 100% rename from examples/sc21/run_figure_13.sh rename to examples/academic_paper_scripts/sc21/run_figure_13.sh diff --git a/examples/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh similarity index 100% rename from examples/sc21/run_figure_14.sh rename to examples/academic_paper_scripts/sc21/run_figure_14.sh diff --git a/examples/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh similarity index 100% rename from examples/sc21/run_figure_15.sh rename to examples/academic_paper_scripts/sc21/run_figure_15.sh diff --git a/examples/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh similarity index 100% rename from examples/sc21/run_figure_16.sh rename to examples/academic_paper_scripts/sc21/run_figure_16.sh diff --git a/examples/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh similarity 
index 100%
rename from examples/sc21/run_figure_17.sh
rename to examples/academic_paper_scripts/sc21/run_figure_17.sh
diff --git a/examples/academic_paper_scripts/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh
new file mode 100755
index 00000000000..c9e254200a1
--- /dev/null
+++ b/examples/academic_paper_scripts/sc21/run_figure_18.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+
+
+
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
+
+
+# Other params.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+
+
+# Name of the job.
+export JOB_NAME=results_figure_18_batch_size_${GBS}
+
+
+# Import the configs.
+. `pwd`/CONFIG.sh
+
+
+# Submit the job.
+. `pwd`/SBATCH.sh
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh
similarity index 100%
rename from examples/sc21/run_table_1.sh
rename to examples/academic_paper_scripts/sc21/run_table_1.sh
diff --git a/examples/bert/README.md b/examples/bert/README.md
new file mode 100644
index 00000000000..6c1fe95bf06
--- /dev/null
+++ b/examples/bert/README.md
@@ -0,0 +1,53 @@
+# BERT MODEL
+
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+
+## 1. Training setup
+
+
+To run the model using a Docker container, run it as follows:
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #
+TENSORBOARD_LOGS_PATH=""#
+VOCAB_FILE="" #//bert-vocab.txt
+DATA_PATH="" #_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
+  bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
+
+```
+NOTE: Depending on the environment you are running in, the above command might look slightly different.
+
+
+## 2. Configurations
+
+The example in this folder shows you how to run the 340M (BERT-Large) model.
There are other configs you could run as well:
+
+### 4B
+```
+       --num-layers 48 \
+       --hidden-size 2560 \
+       --num-attention-heads 32 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 20B
+```
+       --num-layers 48 \
+       --hidden-size 6144 \
+       --num-attention-heads 96 \
+       --tensor-model-parallel-size 4 \
+       --pipeline-model-parallel-size 4 \
+
+```
\ No newline at end of file
diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh
new file mode 100644
index 00000000000..f0d9c87c8bf
--- /dev/null
+++ b/examples/bert/train_bert_340m_distributed.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+# Runs the "340M" parameter model (Bert - Large)
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+CHECKPOINT_PATH=$1 #
+TENSORBOARD_LOGS_PATH=$2 #
+VOCAB_FILE=$3 #/bert-vocab.txt
+DATA_PATH=$4 #_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE
+    --nnodes $NUM_NODES
+    --master_addr $MASTER_ADDR
+    --master_port $MASTER_PORT
+)
+
+BERT_MODEL_ARGS=(
+    --num-layers 24
+    --hidden-size 1024
+    --num-attention-heads 16
+    --seq-length 512
+    --max-position-embeddings 512
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 4
+    --global-batch-size 32
+    --train-iters 1000000
+    --weight-decay 1e-2
+    --clip-grad 1.0
+    --fp16
+    --lr 0.0001
+    --lr-decay-iters 990000
+    --lr-decay-style linear
+    --min-lr 1.0e-5
+    --lr-warmup-fraction .01
+)
+
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 1
+    --pipeline-model-parallel-size 1
+)
+
+DATA_ARGS=(
+    --data-path $DATA_PATH
+    --vocab-file $VOCAB_FILE
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000
+    --eval-interval 1000
+    --save $CHECKPOINT_PATH
+    --load $CHECKPOINT_PATH
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH
+)
+
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \
+    ${BERT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
+
\ No newline at end of file
diff --git a/examples/detxoify_lm/finetune_gpt.py b/examples/detxoify_lm/finetune_gpt.py
deleted file mode 100644
index 70b781e0eee..00000000000
--- a/examples/detxoify_lm/finetune_gpt.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- - -"""Fine-tune GPT""" - -import torch -from functools import partial -import os -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir, os.path.pardir))) -from megatron import get_args -from megatron import get_timers -from megatron import get_tokenizer -from megatron import print_rank_0 -from megatron.core import mpu -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.gpt_dataset import build_train_valid_test_datasets -from megatron.model import GPTModel -from megatron.core.enums import ModelType -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids -from megatron.utils import average_losses_across_data_parallel_group - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process - ) - return model - - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = ['text'] - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. 
- timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') - train_ds, valid_ds1, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) - print_rank_0("> finished creating finetuning GPT datasets ...") - - _, valid_ds, _ = build_train_valid_test_datasets( - data_prefix=args.data_path2, - data_impl="mmap", - splits_string="98,2,0", - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=2048, - seed=1234, - skip_warmup=(not args.mmap_warmup)) - print_rank_0("> finished creating pretrained GPT datasets ...") - - return train_ds, valid_ds, test_ds - - -def add_validation_args(parser): - """Text generation arguments.""" - group = parser.add_argument_group(title='validation set') - group.add_argument('--data-path2', nargs='*', default=None, - help='Path to the validation dataset. Accepted format:' - '1) a single data path, 2) multiple datasets in the' - 'form: dataset1-weight dataset1-path dataset2-weight ' - 'dataset2-path ...') - group.add_argument('--eval-ppl', action='store_true', default=False) - group.add_argument('--stored_params', type=dict, default=dict()) - return parser - - -if __name__ == "__main__": - - pretrain(train_valid_test_datasets_provider, model_provider, - ModelType.encoder_or_decoder, - forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - extra_args_provider=add_validation_args,) diff --git a/examples/detxoify_lm/generate_samples_gpt.py b/examples/detxoify_lm/generate_samples_gpt.py deleted file mode 100644 index 47e1590ea56..00000000000 --- a/examples/detxoify_lm/generate_samples_gpt.py +++ /dev/null @@ -1,199 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
- - -"""Sample Generate GPT""" -import json -import os -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir, os.path.pardir))) -import torch -from megatron import get_args -from megatron import get_tokenizer -from megatron import print_rank_0 -from megatron.checkpointing import load_checkpoint -from megatron.core import mpu -from megatron.initialize import initialize_megatron -from megatron.model import GPTModel -from megatron.training import get_model -from megatron.text_generation import generate_and_post_process - - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - model = GPTModel(num_tokentypes=0, parallel_output=False, - pre_process=pre_process, post_process=post_process) - - return model - -def add_text_generate_args(parser): - """Text generation arguments.""" - group = parser.add_argument_group(title='text generation') - - group.add_argument("--temperature", type=float, default=1.0, - help='Sampling temperature.') - group.add_argument("--greedy", action='store_true', default=False, - help='Use greedy sampling.') - group.add_argument("--top_p", type=float, default=0.0, - help='Top p sampling.') - group.add_argument("--top_k", type=int, default=0, - help='Top k sampling.') - group.add_argument("--out-seq-length", type=int, default=1024, - help='Size of the output generated text.') - group.add_argument("--sample-input-file", type=str, default=None, - help='Get input from file instead of interactive mode, ' - 'each line is an input.') - group.add_argument("--sample-output-file", type=str, default=None, - help='Output file got from --sample-input-file') - group.add_argument("--num-samples", type=int, default=0, - help='Number of samples to generate unconditionally, ' - 'defaults to 0 and interactive conditional sampling') - group.add_argument("--genfile", type=str, - help='Output file when generating unconditionally') - return parser - -def generate_samples_unconditional(model): - args = get_args() - - if torch.distributed.get_rank() == 0: - cnt = 0 - num_samples = args.num_samples - from tqdm import tqdm - pbar = tqdm(total=num_samples) - - while True: - if torch.distributed.get_rank() == 0: - sentences = [''] * args.global_batch_size - print("global batch size", args.global_batch_size) - max_len = args.out_seq_length - resp_sentences, resp_sentences_seg, output_logits, \ - tokens = generate_and_post_process(model, prompts=sentences, - tokens_to_generate=max_len, - return_output_log_probs=False, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, - add_BOS=True, - temperature=1.0) - for prompt, generation, token in zip(sentences, resp_sentences, tokens): - datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} - yield datum - cnt += 1 - pbar.update() - if cnt >= num_samples: - break - - if cnt >= num_samples: - pbar.close() - break - else: - generate_and_post_process(model) - - -def generate_samples_conditional(model): - args = get_args() - - if torch.distributed.get_rank() == 0: - num_samples = args.num_samples - cnt = 0 - from tqdm import tqdm - pbar = tqdm(total=num_samples) - - fname = open(args.sample_input_file, "r") - lines = fname.readlines() - all_raw_text = [json.loads(line)['prompt']['text'] for line in lines] - input_count = len(all_raw_text) - input_pos = 0 - - while True: - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - sentences = [] - print("global batch size", 
args.global_batch_size) - for _ in range(args.global_batch_size): - if input_pos >= input_count: - print(f"input pos: {input_pos}, input count: {input_count}") - raw_text = "EMPTY TEXT" - else: - raw_text = all_raw_text[input_pos] - input_pos += 1 - sentences.append(raw_text) - - max_len = args.out_seq_length - resp_sentences, resp_sentences_seg, output_logits, \ - tokens = generate_and_post_process(model, prompts=sentences, - tokens_to_generate=max_len, - return_output_log_probs=False, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, - add_BOS=False, - temperature=1.0) - for prompt, generation, token in zip(sentences, resp_sentences, tokens): - datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} - yield datum - cnt += 1 - pbar.update() - if cnt >= num_samples: - break - - if cnt >= num_samples: - pbar.close() - break - else: - generate_and_post_process(model) - - -def generate_and_write_samples_unconditional(model): - args = get_args() - assert args.genfile is not None - with open(args.genfile, 'w') as f: - for datum in generate_samples_unconditional(model): - if torch.distributed.get_rank() == 0: - f.write(json.dumps(datum) + '\n') - - -def generate_and_write_samples_conditional(model): - args = get_args() - if args.sample_output_file is None: - sample_output_file = args.sample_input_file + ".out" - print('`sample-output-file` not specified, setting ' - 'it to {}'.format(sample_output_file)) - else: - sample_output_file = args.sample_output_file - with open(sample_output_file, 'w') as f: - for datum in generate_samples_conditional(model): - if torch.distributed.get_rank() == 0: - f.write(json.dumps(datum) + '\n') - - -def main(): - """Main program.""" - - initialize_megatron(extra_args_provider=add_text_generate_args, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer', - 'no_load_rng': True, - 'no_load_optim': True, - 'seq_length': 2048}) - - # Set up model and load checkpoint - model = get_model(model_provider, wrap_with_ddp=False) - - args = get_args() - - if args.load is not None: - _ = load_checkpoint(model, None, None) - model = model[0] - - # Generate samples. 
-    if args.sample_input_file != None:
-        print(f"{args.sample_input_file}")
-        generate_and_write_samples_conditional(model)
-    else:
-        generate_and_write_samples_unconditional(model)
-
-
-if __name__ == "__main__":
-
-    main()
diff --git a/examples/evaluate_retriever_nq.sh b/examples/evaluate_retriever_nq.sh
deleted file mode 100644
index 16e937f4fd0..00000000000
--- a/examples/evaluate_retriever_nq.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Evaluate natural question test data given Wikipedia embeddings and pretrained
-# ICT model or a finetuned model for Natural Question task
-
-# Datasets can be downloaded from the following link:
-# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
-
-EVIDENCE_DATA_DIR=
-EMBEDDING_PATH=
-CHECKPOINT_PATH=
-
-QA_FILE=
-
-python tasks/main.py \
-    --task RETRIEVER-EVAL \
-    --tokenizer-type BertWordPieceLowerCase \
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --tensor-model-parallel-size 1 \
-    --micro-batch-size 128 \
-    --activations-checkpoint-method uniform \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --load ${CHECKPOINT_PATH} \
-    --evidence-data-path ${EVIDENCE_DATA_DIR} \
-    --embedding-path ${EMBEDDING_PATH} \
-    --retriever-seq-length 256 \
-    --vocab-file bert-vocab.txt\
-    --qa-data-test ${QA_FILE} \
-    --faiss-use-gpu \
-    --retriever-report-topk-accuracies 1 5 20 100 \
-    --fp16 \
-    --indexer-log-interval 1000 \
-    --indexer-batch-size 128
-
-
diff --git a/examples/evaluate_zeroshot_gpt.sh b/examples/evaluate_zeroshot_gpt.sh
deleted file mode 100755
index f8c38dc01d4..00000000000
--- a/examples/evaluate_zeroshot_gpt.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-TASK="LAMBADA"
-
-VALID_DATA=
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT=checkpoints/gpt2_345m
-
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
-       --task $TASK \
-       --valid-data $VALID_DATA \
-       --tokenizer-type GPT2BPETokenizer \
-       --strict-lambada \
-       --vocab-file $VOCAB_FILE \
-       --merge-file $MERGE_FILE \
-       --load $CHECKPOINT \
-       --tensor-model-parallel-size 1 \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --batch-size 8 \
-       --activations-checkpoint-method uniform \
-       --seq-length 1024 \
-       --max-position-embeddings 1024 \
-       --log-interval 10 \
-       --fp16 \
-       --no-load-optim \
-       --no-load-rng
diff --git a/examples/export/README.md b/examples/export/README.md
new file mode 100644
index 00000000000..c9539e8ab21
--- /dev/null
+++ b/examples/export/README.md
@@ -0,0 +1,10 @@
+# Megatron Core Export
+
+This module is used to export megatron core models to different inference frameworks.
+Currently we support TRTLLM export; support for vLLM and other frameworks will be added in the future.
+
+## PTQ AND EXPORT
+Follow the examples of [Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment.
+
+## TRTLLM EXPORT
+Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone.
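+
+For orientation, here is a condensed sketch of that flow, assuming `gpt_model` is an already-built Megatron Core `GPTModel` and that the remaining `TRTLLMHelper` and engine options take the values shown in the [trtllm_export](./trtllm_export/) guide; treat the path and sizes as placeholders, not a definitive recipe.
+
+```python
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.export_config import ExportConfig
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+
+# 1. Wrap the model's transformer config in the export helper
+#    (the full constructor argument list is shown in the guide).
+trtllm_helper = TRTLLMHelper(transformer_config=gpt_model.config, model_type=ModelType.gpt)
+
+# 2. Convert the state dict into TRTLLM weights and configs,
+#    filtering out the None-valued _extra_state entries.
+weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+    model_state_dict={k: v for k, v in gpt_model.state_dict().items() if v is not None},
+    dtype=DataType.bfloat16,
+    export_config=ExportConfig(inference_tp_size=1),
+)
+
+# 3. Build and save one engine per (weights, config) pair.
+for weights, config in zip(weight_list, config_list):
+    trtllm_helper.build_and_save_engine(
+        engine_dir='/tmp/trtllm_engine',  # hypothetical output path
+        trtllm_model_weights=weights,
+        trtllm_model_config=config,
+        max_input_len=256,
+        max_output_len=256,
+        max_batch_size=8,
+    )
+```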
diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md
new file mode 100644
index 00000000000..52cad785838
--- /dev/null
+++ b/examples/export/trtllm_export/README.md
@@ -0,0 +1,161 @@
+# Megatron Core To TRTLLM Export Documentation
+This guide will walk you through how you can use the megatron core export to export models to the TRTLLM format.
+
+### Contents
+- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Understanding The Code](#11-understanding-the-code)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. GPU Export](#2-gpu-export)
+  - [3. Future work](#3-future-work)
+
+#### 1. Quick Start
+This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py).
+
+NOTE: For faster performance, if your entire model fits into GPU memory, transfer the model state dict to GPU beforehand and then call the get_trtllm_pretrained_config_and_model_weights function.
+
+
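+A minimal sketch of that pre-transfer, assuming the whole model fits in GPU memory (`trtllm_helper` and `export_config` are the objects created in steps 3 and 4 below, and the `None` filtering mirrors step 4):
+
+```python
+# Move the filtered state dict to GPU before conversion; _extra_state
+# entries are None and are skipped, matching the CPU walkthrough below.
+model_state_dict = {
+    key: val.cuda()
+    for key, val in gpt_model.state_dict().items()
+    if val is not None
+}
+
+weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+    model_state_dict=model_state_dict,
+    dtype=DataType.bfloat16,
+    export_config=export_config,
+)
+```
+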
+
+##### 1.1 Understanding The Code
+***STEP 1 - We initialize model parallel and other default arguments***
+We initialize TP and PP to 1 so that we can get the full model state dict on CPU.
+```python
+    initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+```
+
+***STEP 2 - We load the model using the model_provider_function***
+NOTE: We create a simple gpt model
+
+```python
+    transformer_config = TransformerConfig(
+        num_layers=2,
+        hidden_size=64, # Needs to be at least 32 times num_attn_heads
+        num_attention_heads=2,
+        use_cpu_initialization=True,
+        pipeline_dtype=torch.float32,
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=get_gpt_layer_local_spec(),
+        vocab_size=100,
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    # Optionally you can also load a model using this code
+    # sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    # gpt_model.load_state_dict(checkpoint)
+
+```
+
+***STEP 3 - Instantiate the TRTLLM Helper***
+We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model we instantiate trtllm_helper as shown below.
+```python
+    seq_len_interpolation_factor = None  # stays None when the model has no rotary embeddings
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+        transformer_config=gpt_model.config,
+        model_type=ModelType.gpt,
+        position_embedding_type = gpt_model.position_embedding_type,
+        max_position_embeddings = gpt_model.max_position_embeddings,
+        rotary_percentage = gpt_model.rotary_percent,
+        rotary_base = gpt_model.rotary_base,
+        moe_tp_mode = 2,
+        multi_query_mode = False,
+        activation = "gelu",
+        seq_len_interpolation_factor = seq_len_interpolation_factor,
+        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+    )
+```
+
+***STEP 4 - Get the TRTLLM Weights and configs***
+To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass as inputs the model state dict and the export config. In this example we use an inference TP size of 2 for the export.
+
+```python
+    model_state_dict={}
+    for key, val in gpt_model.state_dict().items():
+        # val is None for _extra_state layers. We filter those out.
+        if val is not None:
+            model_state_dict[key] = val
+
+    export_config = ExportConfig(inference_tp_size = 2)
+    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= model_state_dict,
+        dtype = DataType.bfloat16,
+        export_config=export_config
+    )
+```
+
+***STEP 5 - Build the TRTLLM Engine***
+The following code is used to build and save the TRTLLM engine.
+ +```python + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) +``` +
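+As a cheap sanity check (an assumption based on the loop above rather than documented behavior), the converter is expected to return one (weights, config) pair per inference TP rank:
+
+```python
+# Assumed invariant: one (weights, config) pair per inference TP rank,
+# so inference_tp_size=2 should yield two entries in each list.
+assert len(weight_list) == len(config_list) == export_config.inference_tp_size
+```
+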
+ +##### 1.2 Running The Code +An example run script is shown below. + +``` +# In a workstation +MLM_PATH=/path/to/megatron-lm +CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86 + +docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash + +# Inside the container run the following. + +cd /opt/megatron-lm/ + +CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py +``` + +
+
+#### 2. GPU Export
+You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device, distributed version of the TRTLLM export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
+In the single device version you collect all the model weights on CPU/GPU, convert them to the TRTLLM format, and then store the engine on disk. In the GPU version you load each rank's state dict on the GPUs, convert it on the device itself, and store the engine on disk.
+
+To run the GPU version:
+
+```
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+```
+
+
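+The conversion call is the main difference from the single-device flow. As in [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py), you pass `on_device_distributed_conversion=True` instead of an `ExportConfig`, and each rank converts and builds its own shard. A trimmed sketch (the full engine argument list is in the script):
+
+```python
+trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+    model_state_dict=gpt_model.state_dict(),
+    dtype=DataType.bfloat16,
+    on_device_distributed_conversion=True,
+    vocab_size=_VOCAB_SIZE,
+    gpus_per_node=2,
+)
+
+trtllm_helper.build_and_save_engine(
+    engine_dir='/opt/megatron-lm/engine',
+    trtllm_model_weights=trtllm_model_weights[0],  # the script indexes the single per-rank entry
+    trtllm_model_config=trtllm_model_config[0],
+    max_input_len=256,
+    max_output_len=256,
+    max_batch_size=8,
+)
+```
+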
+
+#### 3. Future work
+The following are planned for future releases:
+* Pipeline parallelism for export (Work in progress)
+* GPU Export for more models (Work in progress for some models)
+* Refit functionality
+* VLLM Support
\ No newline at end of file
diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
new file mode 100644
index 00000000000..57d44f9f628
--- /dev/null
+++ b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
@@ -0,0 +1,117 @@
+import os
+import torch
+from megatron.core import parallel_state
+from megatron.core import dist_checkpointing
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+
+_SEQUENCE_LENGTH = 64
+_VOCAB_SIZE = 256
+
+def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+    parallel_state.destroy_model_parallel()
+
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2,
+        hidden_size=64,
+        num_attention_heads=2,
+        use_cpu_initialization=True,
+        pipeline_dtype=torch.float32
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=get_gpt_layer_local_spec(),
+        vocab_size=_VOCAB_SIZE,
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    return gpt_model
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+if __name__ == "__main__":
+    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+    device = torch.device("cuda")
+    gpt_model.to(device)
+
+    # Optionally you can also load a gpt model from ckpt_path using this code below
+    # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
+
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+        transformer_config=gpt_model.config,
+        model_type=ModelType.gpt,
+        position_embedding_type = gpt_model.position_embedding_type,
+        max_position_embeddings = gpt_model.max_position_embeddings,
+        rotary_percentage = gpt_model.rotary_percent,
+        rotary_base = gpt_model.rotary_base,
+        moe_tp_mode = 2,
+        multi_query_mode = False,
+        activation = "gelu",
+        seq_len_interpolation_factor = 
seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + on_device_distributed_conversion=True, + vocab_size=_VOCAB_SIZE, + gpus_per_node=2, + ) + + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights[0], + trtllm_model_config=trtllm_model_config[0], + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py new file mode 100644 index 00000000000..587e7cfdd32 --- /dev/null +++ b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py @@ -0,0 +1,118 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be atleast 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + # Need to use TP1 PP1 for export on single device + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + 
model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + export_config = ExportConfig(inference_tp_size = 2) + # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + export_config=export_config + ) + + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) \ No newline at end of file diff --git a/examples/finetune_mnli_distributed.sh b/examples/finetune_mnli_distributed.sh deleted file mode 100755 index 9219e595dd2..00000000000 --- a/examples/finetune_mnli_distributed.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -TRAIN_DATA="data/glue_data/MNLI/train.tsv" -VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ - data/glue_data/MNLI/dev_mismatched.tsv" -PRETRAINED_CHECKPOINT=checkpoints/bert_345m -VOCAB_FILE=bert-vocab.txt -CHECKPOINT_PATH=checkpoints/bert_345m_mnli - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task MNLI \ - --seed 1234 \ - --train-data $TRAIN_DATA \ - --valid-data $VALID_DATA \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --epochs 5 \ - --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 8 \ - --activations-checkpoint-method uniform \ - --lr 5.0e-5 \ - --lr-decay-style linear \ - --lr-warmup-fraction 0.065 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --save-interval 500000 \ - --save $CHECKPOINT_PATH \ - --log-interval 10 \ - --eval-interval 100 \ - --eval-iters 50 \ - --weight-decay 1.0e-1 \ - --fp16 diff --git 
a/examples/finetune_race_distributed.sh b/examples/finetune_race_distributed.sh deleted file mode 100755 index e7f70a70abe..00000000000 --- a/examples/finetune_race_distributed.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -TRAIN_DATA="data/RACE/train/middle" -VALID_DATA="data/RACE/dev/middle \ - data/RACE/dev/high" -VOCAB_FILE=bert-vocab.txt -PRETRAINED_CHECKPOINT=checkpoints/bert_345m -CHECKPOINT_PATH=checkpoints/bert_345m_race - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task RACE \ - --seed 1234 \ - --train-data $TRAIN_DATA \ - --valid-data $VALID_DATA \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --epochs 3 \ - --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 4 \ - --activations-checkpoint-method uniform \ - --lr 1.0e-5 \ - --lr-decay-style linear \ - --lr-warmup-fraction 0.06 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --save-interval 100000 \ - --save $CHECKPOINT_PATH \ - --log-interval 10 \ - --eval-interval 100 \ - --eval-iters 50 \ - --weight-decay 1.0e-1 \ - --clip-grad 1.0 \ - --hidden-dropout 0.1 \ - --attention-dropout 0.1 \ - --fp16 diff --git a/examples/finetune_retriever_distributed.sh b/examples/finetune_retriever_distributed.sh deleted file mode 100755 index 535a2e053d4..00000000000 --- a/examples/finetune_retriever_distributed.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# Finetune a BERT or pretrained ICT model using Google natural question data -# Datasets can be downloaded from the following link: -# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py - -WORLD_SIZE=8 - -DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -CHECKPOINT_PATH= - -# Load either of the below -BERT_LOAD_PATH= -PRETRAINED_CHECKPOINT= - -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ - --task RET-FINETUNE-NQ \ - --train-with-neg \ - --train-hard-neg 1 \ - --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --tensor-model-parallel-size 1 \ - --tokenizer-type BertWordPieceLowerCase \ - --train-data nq-train.json \ - --valid-data nq-dev.json \ - --save ${CHECKPOINT_PATH} \ - --load ${CHECKPOINT_PATH} \ - --vocab-file bert-vocab.txt \ - --bert-load ${BERT_LOAD_PATH} \ - --save-interval 5000 \ - --log-interval 10 \ - --eval-interval 20000 \ - --eval-iters 100 \ - --indexer-log-interval 1000 \ - --faiss-use-gpu \ - --DDP-impl torch \ - --fp16 \ - --retriever-report-topk-accuracies 1 5 10 20 100 \ - --seq-length 512 \ - --retriever-seq-length 256 \ - --max-position-embeddings 512 \ - --retriever-score-scaling \ - --epochs 80 \ - --micro-batch-size 8 \ - --eval-micro-batch-size 16 \ - --indexer-batch-size 128 \ - --lr 2e-5 \ - --lr-warmup-fraction 0.01 \ - --weight-decay 1e-1 diff --git a/examples/gpt3/README.md b/examples/gpt3/README.md new file mode 100644 index 00000000000..8d6f2674163 --- /dev/null +++ b/examples/gpt3/README.md @@ -0,0 +1,57 @@ +# GPT3 MODEL + +## Table of contents +- [1. Training Setup](#1-training-setup) +- [2. Configurations](#2-configurations) +- [3. Training Results](#3-training-results) + +## 1. 
Training setup
+
+
+To run the model using a Docker container, run it as follows:
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #
+TENSORBOARD_LOGS_PATH=""#
+VOCAB_FILE="" #/gpt2-vocab.json
+MERGE_FILE="" #/gpt2-merges.txt
+DATA_PATH="" #_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
+  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
+
+```
+NOTE: Depending on the environment you are running in, the above command might look slightly different.
+
+
+## 2. Configurations
+
+The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
+
+### 345M
+```
+       --num-layers 12 \
+       --hidden-size 512 \
+       --num-attention-heads 8 \
+       --seq-length 1024 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 857M
+```
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --seq-length 2048 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh
new file mode 100755
index 00000000000..be00d76120d
--- /dev/null
+++ b/examples/gpt3/train_gpt3_175b_distributed.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Runs the "175B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+CHECKPOINT_PATH=$1 #
+TENSORBOARD_LOGS_PATH=$2 #
+VOCAB_FILE=$3 #/gpt2-vocab.json
+MERGE_FILE=$4 #/gpt2-merges.txt
+DATA_PATH=$5 #_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE
+    --nnodes $NUM_NODES
+    --master_addr $MASTER_ADDR
+    --master_port $MASTER_PORT
+)
+
+GPT_MODEL_ARGS=(
+    --num-layers 96
+    --hidden-size 12288
+    --num-attention-heads 96
+    --seq-length 2048
+    --max-position-embeddings 2048
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 1536
+    --step-batch-size-schedule "0:16 2.4B:320 4.8B:624 7.2B:928 9.6B:1232 12B:1536"
+    --train-iters 500000
+    --weight-decay 0.1
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --init-method-std 0.006
+    --clip-grad 1.0
+    --fp16
+    --lr 6.0e-5
+    --lr-decay-style cosine
+    --min-lr 6.0e-6
+    --lr-warmup-fraction .001
+    --lr-decay-iters 430000
+)
+
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 8
+    --pipeline-model-parallel-size 16
+)
+
+DATA_ARGS=(
+    --data-path $DATA_PATH
+    --vocab-file $VOCAB_FILE
+    --merge-file $MERGE_FILE
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000
+    --eval-interval 1000
+    --save $CHECKPOINT_PATH
+    --load $CHECKPOINT_PATH
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH
+)
+
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
+    ${GPT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
diff --git a/examples/gptoss/01_convert_from_hf.py b/examples/gptoss/01_convert_from_hf.py
new file mode 100644
index 00000000000..adee3358ec3
--- /dev/null
+++ b/examples/gptoss/01_convert_from_hf.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ +"""Convert HuggingFace checkpoints to Megatron format.""" + +import os +import argparse + +from megatron.bridge import AutoBridge + +def _parse_args(): + parser = argparse.ArgumentParser(description="Convert HF LLMs to Megatron format") + parser.add_argument( + "--hf-model", + type=str, + required=True, + help="HuggingFace model identifier or path", + ) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save the converted Megatron checkpoint", + ) + parser.add_argument('--local-rank', '--local_rank', type=int, default=0) + return parser.parse_args() + +if __name__ == "__main__": + args = _parse_args() + HF_MODEL = args.hf_model + SAVE_PATH = args.save_path + WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) + + if SAVE_PATH is None: + SAVE_PATH = f"./megatron_checkpoints/{HF_MODEL.replace('/', '_')}" + + print(f"Converting {HF_MODEL} to Megatron format...") + print(f"Save path: {SAVE_PATH}") + + bridge = AutoBridge.from_hf_pretrained(HF_MODEL, trust_remote_code=True) + provider = bridge.to_megatron_provider() + # Update these configs as needed + provider.expert_tensor_parallel_size = 1 + provider.tensor_model_parallel_size = 1 + provider.pipeline_model_parallel_size = WORLD_SIZE + provider.finalize() + + model = provider.provide_distributed_model(wrap_with_ddp=False) + + bridge.save_megatron_model( + model, + SAVE_PATH, + hf_tokenizer_path=HF_MODEL + ) + + print(f"Saved Megatron checkpoint to {SAVE_PATH}") diff --git a/examples/gptoss/02_train.sh b/examples/gptoss/02_train.sh new file mode 100755 index 00000000000..d129adc2b84 --- /dev/null +++ b/examples/gptoss/02_train.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1} + + +# Setup arguments with defaults +CHECKPOINT_PATH="NO_VALUE_PROVIDED" +TENSORBOARD_LOGS_PATH="./tensorboard_logs/" +TOKENIZER_ARG="MOCK" +DATA_ARG="MOCK" +DISTRIBUTED_CONFIG_FILE="" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --checkpoint-path) + CHECKPOINT_PATH="$2" + shift 2 + ;; + --tensorboard-logs-path) + TENSORBOARD_LOGS_PATH="$2" + shift 2 + ;; + --tokenizer) + TOKENIZER_ARG="$2" + shift 2 + ;; + --data) + DATA_ARG="$2" + shift 2 + ;; + --distributed-config-file) + DISTRIBUTED_CONFIG_FILE="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --checkpoint-path PATH Path to Megatron checkpoint" + echo " --tensorboard-logs-path PATH Path to TensorBoard logs" + echo " --tokenizer PATH|MOCK Path to tokenizer model, or 'MOCK' (default: MOCK)" + echo " --data PATH|MOCK Data prefix, or 'MOCK' (default: MOCK)" + echo " --distributed-config-file FILE Path to distributed training config file" + echo " -h, --help Show this help message" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Check if checkpoint path exists +if [ ! -d "$CHECKPOINT_PATH" ]; then + echo "Error: Checkpoint path does not exist: $CHECKPOINT_PATH" + exit 1 +fi +echo "Checkpoint path exists: $CHECKPOINT_PATH" + +# Check if tensorboard logs path exists +if [ ! -d "$TENSORBOARD_LOGS_PATH" ]; then + echo "Warning: TensorBoard logs path does not exist. 
Creating: $TENSORBOARD_LOGS_PATH"
+    mkdir -p "$TENSORBOARD_LOGS_PATH"
+fi
+echo "TensorBoard logs path exists: $TENSORBOARD_LOGS_PATH"
+
+# NOTE: by default we use 8 GPUs.
+# These defaults respect any environment variables already set, and can also
+# be overridden by the distributed config file sourced below.
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+NUM_NODES=${NUM_NODES:-1}
+MASTER_ADDR=${MASTER_ADDR:-"localhost"}
+MASTER_PORT=${MASTER_PORT:-6000}
+NODE_RANK=${NODE_RANK:-0}
+
+# Load distributed config from file if provided
+if [ -n "$DISTRIBUTED_CONFIG_FILE" ]; then
+    if [ ! -f "$DISTRIBUTED_CONFIG_FILE" ]; then
+        echo "Warning: Distributed config file does not exist: $DISTRIBUTED_CONFIG_FILE"
+        echo "Continuing with default distributed training settings."
+    else
+        echo "Loading distributed config from: $DISTRIBUTED_CONFIG_FILE"
+        source "$DISTRIBUTED_CONFIG_FILE"
+    fi
+fi
+
+# Fall back to defaults for anything still unset
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+NUM_NODES=${NUM_NODES:-1}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+MASTER_PORT=${MASTER_PORT:-6000}
+NODE_RANK=${NODE_RANK:-0}
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+# Path to the pretrain_gpt.py script, assuming this script is run from the root of the Megatron-LM repository
+PRETRAIN_SCRIPT_PATH="pretrain_gpt.py"
+
+# Data cache path (useful for both mock and real data)
+DATA_CACHE_PATH="${PWD}/benchmark_cache_gpt_oss_20b"
+mkdir -p "$DATA_CACHE_PATH"
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE
+    --nnodes $NUM_NODES
+    --master_addr $MASTER_ADDR
+    --master_port $MASTER_PORT
+    --node_rank $NODE_RANK
+)
+
+# NOTE: we only set pipeline parallelism to be the number of GPUs
+# Adjust each value based on your setup.
+TP_SIZE=1
+EP_SIZE=1
+PP_SIZE=${WORLD_SIZE}
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=128
+NUM_LAYERS=12
+DTYPE="fp8"
+SEQ_LENGTH=8192
+MAX_POSITION_EMBEDDINGS=8192
+TRAIN_SAMPLES=1953125000
+LR_DECAY_SAMPLES=1949218748
+
+MODEL_ARGS=(
+    --no-masked-softmax-fusion
+    --transformer-impl transformer_engine
+    --disable-bias-linear
+    --untie-embeddings-and-output-weights
+    --no-rope-fusion
+    --normalization RMSNorm
+    --num-layers ${NUM_LAYERS}
+    --hidden-size 512
+    --ffn-hidden-size 2048
+    --num-attention-heads 64
+    --group-query-attention
+    --num-query-groups 8
+    --seq-length ${SEQ_LENGTH}
+    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS}
+    --use-mcore-models
+    --rotary-percent 1.0
+    --rope-type rope
+    --position-embedding-type rope
+    --rotary-base 10000
+    --no-bias-gelu-fusion
+    --export-force-local-attention
+    --no-bias-dropout-fusion
+    --quick-geglu
+    --glu-linear-offset 1.0
+    --softmax-type learnable
+    --window-attn-skip-freq 2
+    --activation-func-clamp-value 7.0
+    --window-size 127,0
+    --enable-gpt-oss
+)
+
+MOE_ARGS=(
+    --num-experts 4
+    --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss
+    --moe-aux-loss-coeff 1e-3
+    --moe-grouped-gemm
+    --moe-token-dispatcher-type alltoall
+    --overlap-param-gather
+    --overlap-grad-reduce
+    --moe-ffn-hidden-size 2048
+    --moe-router-dtype fp32
+    --moe-z-loss-coeff 1e-3
+    --moe-permute-fusion
+)
+
+DATA_ARGS_LIST=()
+if [[ "$TOKENIZER_ARG" == "MOCK" ]] || [[ "$DATA_ARG" == "MOCK" ]] || [[ -z "$TOKENIZER_ARG" ]]; then
+    DATA_ARGS_LIST+=(
+        "--mock-data"
+        "--tokenizer-type NullTokenizer"
+        "--vocab-size 128256"
+        "--data-cache-path ${DATA_CACHE_PATH}"
+        "--tiktoken-pattern v2"
+        "--split '99,1,0'"
+
"--no-create-attention-mask-in-dataloader" + "--no-mmap-bin-files" + "--num-workers 1" + # Note: --vocab-size might be inferred by HuggingFaceTokenizer or might need to be explicit. + "--vocab-size 128256" + ) +fi + +TRAINING_ARGS=( + --micro-batch-size ${MICRO_BATCH_SIZE} + --global-batch-size ${GLOBAL_BATCH_SIZE} + --lr 1.0e-5 + --train-samples ${TRAIN_SAMPLES} + --lr-decay-samples ${LR_DECAY_SAMPLES} + --lr-decay-style cosine + --min-lr 1.0e-6 + --weight-decay 0.1 + --lr-warmup-fraction 0.05 + --clip-grad 1.0 + --bf16 + --use-flash-attn + --attention-softmax-in-fp32 + --accumulate-allreduce-grads-in-fp32 + --disable-bf16-reduced-precision-matmul + --recompute-activations +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size ${TP_SIZE} + --pipeline-model-parallel-size ${PP_SIZE} + --expert-model-parallel-size ${EP_SIZE} + --sequence-parallel + --context-parallel-size 1 + --use-distributed-optimizer + --fp8-format hybrid + --fp8-param-gather + --fp8-amax-compute-algo max + --fp8-amax-history-len 1024 +) + +LOGGING_ARGS=( + --log-interval 1 + --save-interval 10000 + --eval-interval 50000000 + --eval-iters 0 + --save $CHECKPOINT_PATH + --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" + --moe-per-layer-logging + --no-load-optim + --no-load-rng + --log-throughput +) + +# Ensure pretrain_gpt.py is found +if [ ! -f "$PRETRAIN_SCRIPT_PATH" ]; then + echo "Error: pretrain_gpt.py not found at $PRETRAIN_SCRIPT_PATH" + echo "Please ensure you are running this script from the root of the Megatron-LM repository, and pretrain_gpt.py is present." + exit 1 +fi + +python -m torch.distributed.run ${DISTRIBUTED_ARGS[@]} ${PRETRAIN_SCRIPT_PATH} \ + ${MODEL_ARGS[@]} \ + ${MOE_ARGS[@]} \ + ${DATA_ARGS_LIST[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${LOGGING_ARGS[@]} diff --git a/examples/gptoss/03_convert_to_hf.py b/examples/gptoss/03_convert_to_hf.py new file mode 100644 index 00000000000..8089afec854 --- /dev/null +++ b/examples/gptoss/03_convert_to_hf.py @@ -0,0 +1,52 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +"""Convert HuggingFace checkpoints to Megatron format.""" + +import os +import argparse + +from megatron.bridge import AutoBridge + +def _parse_args(): + parser = argparse.ArgumentParser(description="Convert Megatron LLMs to HuggingFace format") + parser.add_argument( + "--hf-model", + type=str, + required=True, + help="HuggingFace model identifier or path to load config from", + ) + parser.add_argument( + "--megatron-model", + type=str, + required=True, + help="Megatron model identifier or path", + ) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save the converted HuggingFace checkpoint", + ) + parser.add_argument('--local-rank', '--local_rank', type=int, default=0) + return parser.parse_args() + +if __name__ == "__main__": + args = _parse_args() + HF_MODEL = args.hf_model + MEGATRON_MODEL = args.megatron_model + SAVE_PATH = args.save_path + WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) + + if SAVE_PATH is None: + SAVE_PATH = f"./huggingface_checkpoints/{MEGATRON_MODEL.replace('/', '_')}" + + print(f"Converting {MEGATRON_MODEL} to HuggingFace {HF_MODEL} format...") + print(f"Save path: {SAVE_PATH}") + + bridge = AutoBridge.from_hf_pretrained(HF_MODEL, trust_remote_code=True) + bridge.export_ckpt( + MEGATRON_MODEL, + SAVE_PATH, + ) + + print(f"Saved HuggingFace checkpoint to {SAVE_PATH}") diff --git a/examples/gptoss/README.md b/examples/gptoss/README.md new file mode 100644 index 00000000000..eeb92ad9953 --- /dev/null +++ b/examples/gptoss/README.md @@ -0,0 +1,153 @@ +# GPT-OSS Training Tutorial + +## Step 0: Install Dependencies + +### Using Megatron Bridge + +[Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) + +Megatron Bridge provides a quick and convenient way to convert HuggingFace checkpoints to the Megatron format used by Megatron-LM. Follow the instructions in the [Megatron-Bridge Installation](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/README.md#-installation) to run the nemo docker container and convert checkpoints (via mounted volumes - make sure that the huggingface cache location AND the megatron checkpoint locations are properly mounted, otherwise you may not be saving the converted model to disk correctly). + +Below is an example of how to use Megatron-Bridge inside the pytorch container to convert a HuggingFace model checkpoint to Megatron format. + +Reference: [Megatron-Bridge Dockerfile](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/docker/Dockerfile.ci) + +Inside the [pytorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) run the following commands to install Megatron-Bridge: +```bash +cd /opt +git clone --recursive https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge + +# Make sure submodules are initialized (for 3rdparty/Megatron-LM) +git submodule update --init --recursive + +export PATH="/root/.local/bin:$PATH" +export UV_PROJECT_ENVIRONMENT=/opt/venv +export VIRTUAL_ENV=/opt/venv +export PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" +export UV_LINK_MODE=copy +export UV_VERSION="0.7.2" + +# Install UV +curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh + +# Create virtual environment and build the package +uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages + +uv sync --locked --only-group build +uv sync --locked --link-mode copy --all-extras --all-groups + +uv pip install --no-deps -e . 
+
+source ${UV_PROJECT_ENVIRONMENT}/bin/activate
+```
+
+### Setup Environment
+
+```bash
+export HOST_MEGATRON_LM_DIR="/path/to/your/host/megatron-lm"
+git clone https://github.com/NVIDIA/Megatron-LM.git "$HOST_MEGATRON_LM_DIR"
+cd "$HOST_MEGATRON_LM_DIR"
+```
+
+```bash
+export HF_TOKEN={your_hf_token_here}
+```
+
+## Step 1: Convert HuggingFace to Megatron (Optional - skip if you already have a Megatron checkpoint)
+
+Set `--nproc-per-node` to the number of GPUs per node. Set `--hf-model` to the HuggingFace model, e.g. `openai/gpt-oss-20b`.
+
+```bash
+python3 -m torch.distributed.launch --nproc-per-node=8 examples/gptoss/01_convert_from_hf.py --hf-model openai/gpt-oss-20b
+```
+
+## Step 2: Train from Scratch
+
+To train from scratch, first follow the steps below to set up the environment before running the training script in Docker. Even though we are running the same container as before, it is better to restart the container to ensure a clean environment and that all environment and docker variables are set correctly. For the following example we used 8x GB300, but you should change the number of GPUs and nodes as needed.
+
+### Setup Environment
+
+```bash
+# Change these based on model and directory from previous conversion step
+export MODEL_DIR_NAME="openai_gpt-oss_20b"
+
+export HOST_CHECKPOINT_PATH="./megatron_checkpoints/${MODEL_DIR_NAME}"
+export HOST_TENSORBOARD_LOGS_PATH="./tensorboard_logs/${MODEL_DIR_NAME}"
+```
+
+By default we will use mock data to train the model in the example below. To use your own data, set the following environment variables:
+
+```bash
+# Optional: For real data
+export HOST_TOKENIZER_MODEL_PATH="/path/to/host/tokenizer.model"
+export HOST_DATA_PREFIX="/path/to/host/mydata_prefix"
+```
+
+### Setup Training Configurations
+
+Run the following to create a `distributed_config.env` file with the appropriate distributed training configurations. Change the values as needed for your setup. This file will override the default values in `02_train.sh`.
+
+```bash
+cat > ./distributed_config.env << 'EOF'
+GPUS_PER_NODE=8
+NUM_NODES=1
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NODE_RANK=0
+EOF
+```
+
+### Run Container with Mounted Volumes
+
+**NOTE:** This container runs the example training script `02_train.sh` located in the `examples/gptoss` directory. By default, we have only set pipeline parallelism to be the number of GPUs. Adjust TP_SIZE, EP_SIZE, PP_SIZE, etc. in `02_train.sh`. You can also modify `--hidden-size`, `--ffn-hidden-size`, `--num-attention-heads`, `NUM_LAYERS`, etc.
+
+To train using mock data, run the following command:
+```bash
+PYTORCH_IMAGE="nvcr.io/nvidia/pytorch:25.12-py3"
+
+docker run --rm --gpus all --ipc=host --ulimit memlock=-1 \
+    -v "${HOST_MEGATRON_LM_DIR}:/workspace/megatron-lm" \
+    -v "${HOST_CHECKPOINT_PATH}:/workspace/checkpoints" \
+    -v "${HOST_TENSORBOARD_LOGS_PATH}:/workspace/tensorboard_logs" \
+    -v "./distributed_config.env:/workspace/megatron-lm/examples/gptoss/distributed_config.env" \
+    --workdir /workspace/megatron-lm \
+    $PYTORCH_IMAGE \
+    bash examples/gptoss/02_train.sh \
+    --checkpoint-path /workspace/checkpoints \
+    --tensorboard-logs-path /workspace/tensorboard_logs \
+    --distributed-config-file /workspace/megatron-lm/examples/gptoss/distributed_config.env \
+    2>&1 | tee "${HOST_TENSORBOARD_LOGS_PATH}/training_mock_$(date +'%y-%m-%d_%H-%M-%S').log"
+```
+**Note:** If you run into issues generating mock data, one workaround is to reduce the number of GPUs to 1 and generate the data again.
+
+If using real data with the `HOST_TOKENIZER_MODEL_PATH` and `HOST_DATA_PREFIX` environment variables set, run the following command instead:
+
+```bash
+PYTORCH_IMAGE="nvcr.io/nvidia/pytorch:25.12-py3"
+
+docker run --rm --gpus all --ipc=host --ulimit memlock=-1 \
+    -v "${HOST_MEGATRON_LM_DIR}:/workspace/megatron-lm" \
+    -v "${HOST_CHECKPOINT_PATH}:/workspace/checkpoints" \
+    -v "${HOST_TENSORBOARD_LOGS_PATH}:/workspace/tensorboard_logs" \
+    -v "${HOST_TOKENIZER_MODEL_PATH}:/workspace/tokenizer_model" \
+    -v "$(dirname "${HOST_DATA_PREFIX}"):/workspace/data_dir" \
+    -v "./distributed_config.env:/workspace/megatron-lm/examples/gptoss/distributed_config.env" \
+    --workdir /workspace/megatron-lm \
+    $PYTORCH_IMAGE \
+    bash examples/gptoss/02_train.sh \
+    --checkpoint-path /workspace/checkpoints \
+    --tensorboard-logs-path /workspace/tensorboard_logs \
+    --tokenizer /workspace/tokenizer_model \
+    --data "/workspace/data_dir/$(basename "${HOST_DATA_PREFIX}")" \
+    --distributed-config-file /workspace/megatron-lm/examples/gptoss/distributed_config.env \
+    2>&1 | tee "${HOST_TENSORBOARD_LOGS_PATH}/training_custom_$(date +'%y-%m-%d_%H-%M-%S').log"
+```
+
+## Step 3: Convert Megatron to HuggingFace
+
+Run the following command to convert the Megatron checkpoint produced by training back to the HuggingFace format so it can be shared with others (make sure you have the same virtual environment set up as in Step 0):
+
+```bash
+python3 -m torch.distributed.launch --nproc-per-node=8 examples/gptoss/03_convert_to_hf.py --hf-model openai/gpt-oss-20b --megatron-model ./megatron_checkpoints/openai_gpt-oss_20b
+```
\ No newline at end of file
diff --git a/examples/inference/README.md b/examples/inference/README.md
new file mode 100644
index 00000000000..3259bf7f943
--- /dev/null
+++ b/examples/inference/README.md
@@ -0,0 +1,288 @@
+### Megatron Core Inference Documentation
+This guide provides an example of running model inference with Megatron Core.
+
+### Contents
+- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
+- [Contents](#contents)
+  - [1. Quickstart](#1-quickstart)
+    - [1.1 Code Walkthrough](#11-code-walkthrough)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. Control Flow in the MCore Backend](#2-control-flow-in-the-mcore-backend)
+  - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
+    - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
+    - [3.2. Implement a New Sampling Loop](#32-implement-a-new-sampling-loop)
+    - [3.3. Support Other Models](#33-support-other-models)
+    - [3.4. Modify Inference Parameters](#34-modify-inference-parameters)
+  - [4. Future Work](#4-future-work)
+
+ +#### 1. Quickstart +This example runs statically-batched inference on a model trained using Megatron Core. The entrypoint is [gpt_static_inference.py](./gpt/gpt_static_inference.py). A similar workflow can be adapted for [gpt_dynamic_inference.py](./gpt/gpt_dynamic_inference.py). + +
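+Before launching a full run, you can list the available flags for either entrypoint (assuming a working Megatron-LM environment):
+
+```bash
+torchrun --nproc-per-node=1 examples/inference/gpt/gpt_static_inference.py --help
+```
+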
+
+##### 1.1 Code Walkthrough
+***STEP 1 - Initialize model parallel and other default arguments***
+The micro batch size defaults to 1. It is not used when running with tensor parallelism only; for pipeline-parallel models it is calculated at runtime.
+```python
+# Initialize Megatron model using the same model provider from training.
+initialize_megatron(
+    args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
+)
+```
+
+***STEP 2 - Load the model using the model_provider_function***
+The model provider function supports both MCore and Legacy models.
+
+```python
+# Load the model checkpoint
+model = get_model(model_provider, wrap_with_ddp=False)
+load_checkpoint(model, None, None)
+model.eval()
+model = model[0]
+```
+
+***STEP 3 - Choose an engine***
+Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engines/mcore_engine.py) with a [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future.
+```python
+# Create an inference wrapper to set up the model.
+inference_wrapped_model = GPTInferenceWrapper(model, args)
+
+# Define a sampling loop.
+text_generation_controller = TextGenerationController(
+    inference_wrapped_model=inference_wrapped_model,
+    tokenizer=tokenizer
+)
+
+# Create a static or dynamic inference engine.
+inference_engine = StaticInferenceEngine(
+    text_generation_controller=text_generation_controller,
+    max_batch_size=args.max_batch_size
+)
+```
+
+***STEP 4 - Run text generation***
+The [SamplingParams](../../megatron/core/inference/sampling_params.py) class comes with sensible defaults. Customize it to change top_p, top_k, the number of tokens to generate, etc. The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py).
+```python
+results: List[InferenceRequest] = inference_engine.generate(
+    prompts=args.prompts, sampling_params=sampling_params
+)
+
+if torch.distributed.get_rank() == 0:
+    for idx, result in enumerate(results):
+        print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
+        result = {
+            'id': result.request_id,
+            'input_prompt': result.prompt,
+            'generated_text': result.generated_text,
+            'generated_tokens': result.generated_tokens
+        }
+        print(result)
+```
+
+
+##### 1.2 Running The Code
+An example Slurm script is shown below. Set the tokenizer paths, inference params, and other settings appropriately.
+
+For a recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910).
+
+```bash
+# Slurm cluster settings
+ACCOUNT=
+MLM_PATH=/path/to/megatron-lm
+GPT_CKPT=/path/to/gpt/ckpt
+VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
+CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
+
+srun --account $ACCOUNT \
+--job-name=$ACCOUNT:inference \
+--partition=batch \
+--time=01:00:00 \
+--container-image $CONTAINER_IMAGE \
+--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
+--no-container-mount-home \
+--pty /bin/bash
+
+# Inside the container run the following.
+
+cd megatron-lm/
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+TOKENIZER_ARGS=(
+    --vocab-file /workspace/tokenizer/gpt2-vocab.json
+    --merge-file /workspace/tokenizer/gpt2-merges.txt
+    --tokenizer-type GPT2BPETokenizer
+)
+
+MODEL_ARGS=(
+    --use-checkpoint-args
+    --use-mcore-models
+    --load /workspace/mcore_gpt_ckpt
+)
+
+INFERENCE_SPECIFIC_ARGS=(
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --num-tokens-to-generate 20
+    --max-batch-size 4
+)
+
+torchrun --nproc-per-node=4 examples/inference/gpt/gpt_static_inference.py \
+    ${TOKENIZER_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${INFERENCE_SPECIFIC_ARGS[@]} \
+    --prompts "prompt one " "sample prompt two" "sample prompt 3"
+```
+
+NOTE: Other parameters which can be customized for inference:
+* `--temperature` (sampling temperature)
+* `--top_k` (top-k sampling)
+* `--top_p` (top-p sampling)
+* `--num-tokens-to-generate` (number of tokens to generate for each prompt)
+* `--inference-batch-times-seqlen-threshold` (during inference, if batch size times sequence length is smaller than this threshold, microbatched pipelining is not used)
+* `--use-dist-ckpt` (if using the distributed checkpoint format for the model)
+
+
+
+#### 2. Control Flow in the MCore Backend
+An example of inference with static batching is provided in [gpt_static_inference.py](./gpt/gpt_static_inference.py).
+* The **generate()** function of the [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) is called with the input prompts.
+* The `Scheduler` in the engine adds these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until the max batch size is hit. Remaining requests are added to the waiting requests pool.
+* The engine runs until all requests (waiting + active) are completed.
+    * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller.
+    * This function calls the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop.
+    * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks.
+    * Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits.
+    * Output logits are synchronized across all pipeline parallel ranks.
+    * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters.
+    * The sampled tokens are then appended to the input prompt tokens for the next iteration.
+    * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition.
+    * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed.
+    * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool.
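+
+Schematically, the loop above looks like the following sketch (illustrative pseudocode; the names follow the bullets above, not the exact MCore implementation):
+
+```python
+def generate(scheduler, controller, prompts):
+    # Fill the active pool up to max batch size; the rest wait.
+    scheduler.add_requests(prompts)
+    while scheduler.have_requests():  # waiting + active
+        # One call runs the full autoregressive loop for the active batch.
+        finished = controller.generate_all_output_tokens_static_batch(scheduler.active_requests)
+        # Completed -> completed pool; waiting -> active pool.
+        scheduler.update_requests_pool(finished)
+    return scheduler.completed_requests
+```
+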
+
+#### 3. Customizing The Inference Pipeline
+
+The inference pipeline supports four levels of customization:
+
+* **Inference engine** - The MCore engine supports static and dynamic batching. Modify this to add a new backend.
+* **Text generation controller** - The main sampling loop. Customize this to support alternative tokenization or implement a new sampling strategy.
+* **Inference-wrapped model** - Change this to support a new model.
+* **Inference parameters** - Change these to update top_p, top_k, the number of tokens to generate, temperature, and other sampling parameters.
+
+##### 3.1. Create Your Own Inference Backend
+The [abstract_engine.py](./../../megatron/core/inference/engines/abstract_engine.py) file defines a `generate` method that can be implemented to support a new backend.
+
+```python
+class AbstractEngine(ABC):
+    @abstractmethod
+    def generate(self) -> dict:
+        """The abstract backend's generate function.
+
+        To define a new backend, implement this method and return the outputs as a dictionary.
+        """
+```
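+
+As a minimal sketch, a toy backend could look like this (illustrative only; `EchoEngine` is hypothetical, and a real backend would schedule requests and run model forward steps):
+
+```python
+from megatron.core.inference.engines.abstract_engine import AbstractEngine
+
+
+class EchoEngine(AbstractEngine):
+    """Toy backend that 'generates' by echoing each prompt back."""
+
+    def __init__(self, prompts):
+        self.prompts = prompts
+
+    def generate(self) -> dict:
+        # Return the outputs as a dictionary, keyed by request index.
+        return {i: {"generated_text": p} for i, p in enumerate(self.prompts)}
+```
+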
+
+##### 3.2. Implement a New Sampling Loop
+
+The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies.
+
+```python
+class TextGenerationController:
+
+    def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Utility to tokenize the input prompts."""
+
+    def sample_from_logits(
+        self,
+        last_token_logits: torch.Tensor,
+        sampling_params: SamplingParams,
+        vocab_size: int,
+        generation_started: Optional[torch.Tensor] = None,
+        top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None,
+    ) -> torch.Tensor:
+        """Samples the logits to generate outputs.
+
+        Given the logits of the last token, this function samples according to the
+        parameters defined in sampling_params and returns the sampled tokens. If
+        sampling_params.top_n_logprobs > 0, it also updates top_n_logprobs_dict at each step.
+        """
+
+    def update_generation_status(
+        self,
+        updated_prompts_tokens: torch.Tensor,
+        generation_started: torch.Tensor,
+        current_context_end_position: int,
+        is_generation_done_tensor: torch.Tensor,
+        generated_sequence_lengths: torch.Tensor,
+    ) -> torch.Tensor:
+        """Checks which prompts have reached an end condition.
+
+        Sets the corresponding flags of is_generation_done_tensor to True for prompts
+        that have reached an end condition. The generated sequence lengths increase as
+        generation continues, until a prompt hits an EOD condition. The generation_started
+        tensor tracks which prompts have started generating.
+        """
+
+    def generate_all_output_tokens_static_batch(
+        self, active_requests: OrderedDict[int, InferenceRequest],
+    ) -> OrderedDict[int, InferenceRequest]:
+        """Utility to generate all the output tokens and probabilities for the prompts.
+
+        Generates the output tokens for a static batch. Runs forward steps until all
+        prompts complete generation, updates the status of these requests to completed,
+        attaches the generated results, and returns the requests.
+        """
+
+    def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
+        """Detokenize the output generations."""
+```
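+
+For example, a controller that always decodes greedily could override `sample_from_logits` (a minimal sketch; the extra keyword arguments mirror the signature above and are intentionally ignored):
+
+```python
+import torch
+
+from megatron.core.inference.text_generation_controllers.text_generation_controller import (
+    TextGenerationController,
+)
+
+
+class GreedyTextGenerationController(TextGenerationController):
+    """Custom sampling strategy: always pick the argmax token."""
+
+    def sample_from_logits(self, last_token_logits, sampling_params, vocab_size, **kwargs):
+        # Ignore top_k / top_p / temperature and decode greedily.
+        return torch.argmax(last_token_logits, dim=-1)
+```
+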
+
+##### 3.3. Support Other Models
+Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements:
+* A forward method that calls the model `forward` method depending on the model parallel settings
+* Initialization of the model, putting it in `.eval()` mode
+* Setup of the input parameters (max batch size, max sequence length)
+
+The following methods should be implemented:
+```python
+class AbstractModelInferenceWrapper:
+    def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
+        """A utility function for preparing the model for inference.
+
+        Called once before the autoregressive inference loop. It puts the model in eval
+        mode and gets some model and inference data parameters. Extend this to build
+        position ids, attention masks, etc., so that the required slices can be extracted
+        during the forward pass.
+        """
+
+    @abc.abstractmethod
+    def get_batch_for_context_window(self) -> List:
+        """Returns the input data for inference.
+
+        Called iteratively in the inference loop. It can be used to extract the relevant
+        input from the prompt tokens, attention mask, etc. required for each step of
+        inference.
+        """
+```
+
+Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
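+
+A skeleton for a new model wrapper might look like the following (a sketch against the methods shown above; the attribute names and the context-window arguments are illustrative assumptions, so check the GPT wrapper linked above for the authoritative signatures):
+
+```python
+from typing import List
+
+import torch
+
+from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
+    AbstractModelInferenceWrapper,
+)
+
+
+class MyModelInferenceWrapper(AbstractModelInferenceWrapper):
+    """Sketch of a wrapper for a custom model; method bodies are placeholders."""
+
+    def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
+        super().prep_model_for_inference(prompts_tokens)
+        # Hypothetical: keep the batch and precompute position ids for later slicing.
+        self.prompts_tokens = prompts_tokens
+        self.position_ids = (
+            torch.arange(prompts_tokens.size(1), device=prompts_tokens.device)
+            .unsqueeze(0)
+            .expand_as(prompts_tokens)
+        )
+
+    def get_batch_for_context_window(self, context_start: int, context_end: int) -> List:
+        # Slice out just the inputs needed for this step of the inference loop.
+        tokens2use = self.prompts_tokens[:, context_start:context_end]
+        positions2use = self.position_ids[:, context_start:context_end]
+        return [tokens2use, positions2use]
+```
+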
+
+##### 3.4. Modify Inference Parameters
+Text generation is controlled through [SamplingParams](../../megatron/core/inference/sampling_params.py). Customize it to change `top_p`, `top_k`, the number of tokens to generate, etc. Additional attributes can be added for the inference loop as shown below.
+
+```python
+from megatron.core.inference.sampling_params import SamplingParams
+
+c = SamplingParams(temperature=0.5)
+c.add_attributes({'min_length': 4, 'eod_id': 153})
+```
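+
+The resulting parameters are then passed to the engine's `generate()` call from Section 1.1 (a sketch reusing the `inference_engine` built there):
+
+```python
+sampling_params = SamplingParams(temperature=0.7, top_k=8, top_p=0.9, num_tokens_to_generate=64)
+results = inference_engine.generate(prompts=["Hello world"], sampling_params=sampling_params)
+```
+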
+
+#### 4. Future Work
+The following features are planned for future releases.
+* TRTLLM Engine support
+* Continuous batching optimizations
+* Speculative decoding
\ No newline at end of file
diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py
new file mode 100644
index 00000000000..02a257c1b46
--- /dev/null
+++ b/examples/inference/gpt/gpt_dynamic_inference.py
@@ -0,0 +1,523 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# pylint: disable=bad-builtin
+
+import hashlib
+import io
+import json
+import os
+import sys
+import warnings
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+
+from megatron.training.arguments import parse_and_validate_args
+import torch
+from tqdm import tqdm
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+from examples.inference.gpt.utils import (
+    Request,
+    build_dynamic_engine_setup_prefix,
+    build_requests,
+    get_curr_time,
+    get_global_peak_memory_stats_bytes,
+)
+from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext
+from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
+    GPTInferenceWrapper,
+)
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.text_generation_controller import (
+    TextGenerationController,
+)
+from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
+from megatron.inference.utils import (
+    add_inference_args,
+    get_inference_config_from_model_and_args,
+    get_model_for_inference,
+)
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+import logging
+
+import megatron
+from megatron.core.utils import configure_nvtx_profiling
+from megatron.training import get_args, get_tokenizer, initialize_megatron
+
+torch.serialization.add_safe_globals([io.BytesIO])
+torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunState])
+torch.serialization.add_safe_globals([megatron.core.rerun_state_machine.RerunDiagnostic])
+
+
+def run_inference(
+    requests: List[Request],
+    engine: DynamicInferenceEngine,
+    sampling_params: Optional[SamplingParams] = None,
+) -> Dict[str, Any]:
+    """Add requests to engine and generate tokens.
+
+    Args:
+        requests (List[Request]): Requests that are to be added and processed.
+        engine (DynamicInferenceEngine): Inference engine that manages generating tokens.
+        sampling_params (SamplingParams): Deprecated as of megatron-core 0.16.
+
+    Returns:
+        A dictionary with prefill/decode step times, add/output times, the total
+        number of output tokens, and the CUDA graph request count map.
+    """
+
+    if sampling_params is not None and torch.distributed.get_rank() == 0:
+        warnings.warn(
+            "The `sampling_params` argument is deprecated. "
+            "Sampling parameters are specified per request.",
+            DeprecationWarning,
+        )
+
+    args = get_args()
+
+    # Parse batch boundaries for batch-drain mode.
+    batch_ranges = None
+    if args.drain_between_batches and args.batch_boundaries:
+        boundaries = [int(x) for x in args.batch_boundaries.split(",")]
+        num_requests_total = len(requests)
+        batch_ranges = []
+        for i, start in enumerate(boundaries):
+            end = boundaries[i + 1] if i + 1 < len(boundaries) else num_requests_total
+            batch_ranges.append((start, end))
+
+    # Initialize request arrival times.
+ base_arrival_time = get_curr_time() + for request in requests: + request.time_arrival = request.time_offset + base_arrival_time + + # Add and process requests. + num_requests_total = len(requests) + num_requests_added = 0 + num_requests_finished = 0 + step_times = {"prefill": [], "decode": []} + add_times = [] + output_times = [] + tbar = tqdm(total=num_requests_total) + total_output_tokens = 0 + attempted_step_count = 0 + if args.cuda_graph_impl == "local": + cuda_graph_request_count_map = {} + else: + cuda_graph_request_count_map = None + + def _add_request(): + """Add request to engine. + + *Note: Using `prompt_text` instead of `prompt_tokens` for fair comparison. + """ + nonlocal num_requests_added + _request = requests[num_requests_added] + engine.add_request(num_requests_added, _request.prompt_text, _request.sampling_params) + _request.time_start = get_curr_time() + _request.state = "started" + num_requests_added += 1 + tbar.update(1) + + def _process_step_result(result): + """Process a single engine step result, updating bookkeeping state.""" + nonlocal total_output_tokens, num_requests_finished + + is_decode_only = engine.is_decode_only + + # Record cuda_graph_request_count. + cuda_graph_request_count = result["cuda_graph_request_count"] + if args.cuda_graph_impl == "local" and cuda_graph_request_count is not None: + cuda_graph_request_count_map[cuda_graph_request_count] = ( + cuda_graph_request_count_map.get(cuda_graph_request_count, 0) + 1 + ) + + # Update requests. + active_request_ids = result["active_request_ids"] + finished_request_records = result["finished_request_records"] + step_time = result["step_time"] + if len(active_request_ids) > 0 or len(finished_request_records) > 0: + if is_decode_only: + step_times["decode"].append(step_time) + else: + step_times["prefill"].append(step_time) + + # Append output tokens. + output_start = get_curr_time() + for finished_request_record in finished_request_records: + + finished_request = finished_request_record.merge() + + # Update local request object. + request = requests[finished_request.request_id] + request.time_end = get_curr_time() + request.state = "finished" + request.request_id = finished_request.request_id + request.events = finished_request.events + + request.ttft = finished_request.ttft + + # Update prompt, in case engine has been suspended and resumed. + request.prompt_tokens = finished_request.prompt_tokens.tolist() + request.prompt_text = finished_request.prompt + + # Get output tokens and text. + request.output_tokens = finished_request.generated_tokens + request.output_text = finished_request.generated_text + total_output_tokens += len(request.output_tokens) + + # Log probs. + if finished_request.sampling_params.return_log_probs: + if not finished_request.prompt_log_probs: + finished_request.prompt_log_probs = [] + request.prompt_log_probs = finished_request.prompt_log_probs + request.generated_log_probs = finished_request.generated_log_probs + request.logprobs = ( + finished_request.prompt_log_probs + finished_request.generated_log_probs + ) + if finished_request.sampling_params.top_n_logprobs > 0: + request.generated_top_n_logprobs = finished_request.generated_top_n_logprobs + if not finished_request.sampling_params.skip_prompt_log_probs: + request.prompt_top_n_logprobs = finished_request.prompt_top_n_logprobs + num_requests_finished += 1 + output_times.append(get_curr_time() - output_start) + + if batch_ranges is not None: + # Batch-drain mode: add all requests in a batch, drain, then next batch. 
+ for batch_idx, (batch_start, batch_end) in enumerate(batch_ranges): + # Add all requests in current batch. + add_start = get_curr_time() + while num_requests_added < batch_end: + _add_request() + add_times.append(get_curr_time() - add_start) + + # Step until all active requests finish (drain). + while engine.has_unfinished_requests(): + try: + result = engine.step_modern() + except EngineSuspendedError as e: + result = e + attempted_step_count += 1 + + if isinstance(result, EngineSuspendedError): + continue + + _process_step_result(result) + else: + # Original mode: add requests per step based on arrival time or count. + while True: + # Add requests. + add_start = get_curr_time() + if args.incoming_requests_per_step is None: + # Add requests with 'earlier' arrival time. + while num_requests_added < num_requests_total: + if requests[num_requests_added].time_arrival > add_start: + break + _add_request() + else: + # Add deterministic number of requests (generally used for debugging). + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): + _add_request() + add_times.append(get_curr_time() - add_start) + + # Step inference engine (i.e., generate a token for each active request). + # Before step, we haven't done the scheduling, so we cannot know the is_decode_only + try: + result = engine.step_modern() + except EngineSuspendedError as e: + result = e + pass # ignore error in order to call 'engine.resume()' below. + attempted_step_count += 1 + + # Test suspending and resuming engine. + if args.suspend_resume_interval is not None: + + # Suspend. + if attempted_step_count % args.suspend_resume_interval == 0: + print("**** step %d/%d ... suspend." % (engine.context.step_count, attempted_step_count)) + engine.suspend() + + # Resume, 0+ attempted steps later. + if ( + attempted_step_count > 0 + and (attempted_step_count - args.suspend_resume_interval // 2) + % args.suspend_resume_interval + == 0 + ): + print("**** step %d/%d ... resume." % (engine.context.step_count, attempted_step_count)) + engine.resume() + + # If engine suspended, continue to next iter. + if isinstance(result, EngineSuspendedError): + continue + + _process_step_result(result) + + # Check if all requests are finished. + if not (engine.has_unfinished_requests() or num_requests_added < num_requests_total): + break + + # Resume engine (NOOP if not suspended). + engine.resume() + + return { + "step_times": step_times, + "add_times": add_times, + "output_times": output_times, + "total_output_tokens": total_output_tokens, + "cuda_graph_request_count_map": cuda_graph_request_count_map, + } + + +@torch.inference_mode() +def main(): + """Run dynamic inference.""" + # Initialize Megatron. + args = parse_and_validate_args( + extra_args_provider=add_inference_args, + args_defaults={'no_load_rng': True, 'no_load_optim': True}, + ) + initialize_megatron() + + # Start Nsight profiler. + if os.environ.get("NSIGHT_PREFIX"): + torch.cuda.cudart().cudaProfilerStart() + + level_str = os.getenv("LOG_LEVEL", "INFO").upper() + level = getattr(logging, level_str, logging.INFO) + logging.basicConfig(level=level, force=True) + + configure_nvtx_profiling(True) + + # Build tokenizer + tokenizer = build_tokenizer(args) + + # Reset peak memory stats so functional tests measure this run and not + # whatever happened earlier during initialization. + torch.cuda.reset_peak_memory_stats() + + # Sampling params. 
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        skip_prompt_log_probs=args.skip_prompt_log_probs,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+        termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod,
+        top_n_logprobs=args.top_n_logprobs,
+        stop_words=args.stop_words,
+    )
+
+    model = get_model_for_inference()
+
+    # Requests, context, controller.
+    requests = build_requests(args, tokenizer, sampling_params)
+    inference_config = get_inference_config_from_model_and_args(model, args)
+
+    # Calculate max_sequence_length from requests
+    max_gen_length = sampling_params.num_tokens_to_generate
+    max_context_length = max(len(r.prompt_tokens) for r in requests)
+    inference_config.max_sequence_length = max_context_length + max_gen_length
+    context = DynamicInferenceContext(model.config, inference_config)
+    wrapped_model = GPTInferenceWrapper(model, context)
+    controller = TextGenerationController(wrapped_model, tokenizer)
+
+    # Validate all context_length's <= max_tokens.
+    if not args.enable_chunked_prefill:
+        invalid_prompt_length_map = {}
+        for request_idx, request in enumerate(requests):
+            if len(request.prompt_tokens) > context.max_tokens:
+                invalid_prompt_length_map[request_idx] = len(request.prompt_tokens)
+        assert not invalid_prompt_length_map, (
+            "request idxs with prompts longer than context.max_tokens: "
+            + ", ".join(f"{k}({v})" for k, v in invalid_prompt_length_map.items())
+        )
+
+    # Inference engine.
+    engine = DynamicInferenceEngine(controller, context)
+
+    setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests)
+    print("~~~")
+    print(setup_prefix)
+    print("~~~")
+
+    # Run and time test, optionally `args.inference_repeat_n` times.
+    throughputs = []
+    for _ in range(args.inference_repeat_n):
+
+        # Reset engine.
+        engine.reset()
+
+        torch.cuda.reset_peak_memory_stats()
+
+        # Trial.
+        t = get_curr_time()
+        result = run_inference(requests, engine)
+        step_times = result["step_times"]
+        add_times = result["add_times"]
+        output_times = result["output_times"]
+        total_output_tokens = result["total_output_tokens"]
+        torch.cuda.synchronize()
+        total_time = get_curr_time() - t
+        stats = torch.cuda.memory_stats()
+        throughput = total_output_tokens / total_time
+        throughputs.append(throughput)
+
+    # Validate all requests finished.
+    for request in requests:
+        assert request.state == "finished", f"request.state == '{request.state}' != 'finished'."
+
+    peak_mem_stats = get_global_peak_memory_stats_bytes()
+
+    # Print unique prompts + outputs.
+    if torch.distributed.get_rank() == 0:
+
+        def escape_str(s):
+            return s.replace("\n", "\\n")
+
+        print("~~~~ Unique prompts + outputs. ~~~~")
+
+        # Map requests by their prompt.
+        unique_prompt_map = defaultdict(list)
+        for request_idx, request in enumerate(requests):
+            unique_prompt_map[request.prompt_text].append(request_idx)
+
+        # Print unique prompts + outputs.
+ text_hashes = [] + for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()): + + # ---- Prompt summary line ---- + prompt_len = len(requests[request_idxs[0]].prompt_tokens) + escaped_prompt_text = escape_str(prompt_text) + print( + f"\n{unique_idx+1}/{len(unique_prompt_map)}" + f"[n {len(request_idxs)}, l {prompt_len}] {escaped_prompt_text}" + ) + + # ---- Group all outputs for this prompt ---- + output_map = defaultdict(list) + for idx in request_idxs: + req = requests[idx] + output_map[req.output_text].append(idx) + + # ---- Print each unique output ---- + for output_text, output_request_idxs in output_map.items(): + evicted = False + for idx in output_request_idxs: + for event in requests[idx].events: + if event.type.name == "EVICT": + evicted = True + break + if output_text is not None: + # Use hash of prompt + generated text in case engine was + # suspended and resumed, which misaligns boundary between + # prompt and generated tokens. + o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6] + o_len = len(requests[output_request_idxs[0]].output_tokens) + escaped_output_text = escape_str(output_text) + else: + o_hash = "--" + o_len = 0 + escaped_output_text = "--" + print( + f" >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}" + f"{', ' if evicted else ''}] {escaped_output_text}" + ) + text_hashes.append(o_hash) + + # Write results to JSON. Primarily used for functional testing. + if args.output_path: + json_results = {} + + # Write every 'n' requests, plus the final request. + for i, req in enumerate(requests): + if i % args.output_every_n_results == 0 or i == len(requests) - 1: + print(f' Attributes of request {i}: {req.__dict__}') + result_dict = { + "input_prompt": req.prompt_text, + "generated_text": req.output_text, + "generated_tokens": req.output_tokens, + "latency": req.time_end - req.time_start, + "ttft": req.ttft, # Time-to-first-token in seconds + "cuda_graph_request_count_map": result["cuda_graph_request_count_map"], + "step_count": engine.context.step_count, + "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None), + "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None), + } + if req.sampling_params.return_log_probs: + result_dict["prompt_logprobs"] = getattr(req, 'prompt_log_probs', None) + result_dict["generated_logprobs"] = getattr( + req, 'generated_log_probs', None + ) + result_dict["logprobs"] = getattr(req, 'logprobs', None) + if args.output_request_events: + result_dict["events"] = [e.serialize() for e in req.events] + json_results[req.request_id] = result_dict + + # Track system-level throughput as a test / debug metric + if args.record_throughput: + json_results["throughput"] = throughputs + # Attach peak memory metrics; the functional test only validates these + # if the fields exist in the golden values. + json_results.update(peak_mem_stats) + json_results["lifetime_prefill_token_count"] = engine.context.lifetime_prefill_token_count + + print(f' Saving results to {args.output_path}') + with open(args.output_path, "w") as fp: + json.dump(json_results, fp, indent=1) + + # Timing results. 
+    stats = torch.cuda.memory_stats()
+    throughput = total_output_tokens / total_time
+    print("~~~")
+    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
+    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
+
+    p_times = step_times["prefill"]
+    d_times = step_times["decode"]
+
+    p_total = sum(p_times)
+    d_total = sum(d_times)
+
+    p_count = len(p_times)
+    d_count = len(d_times)
+
+    p_mean = p_total / p_count if p_count != 0 else 0.0
+    d_mean = d_total / d_count if d_count != 0 else 0.0
+
+    # Commented out for now as the step/add/output times are not calculated correctly.
+    # print(
+    #     f"{setup_prefix} … "
+    #     f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+    #     f"total time: {step_total:.3f}s … "
+    #     f"step time: total {step_total:.3f}s "
+    #     f"[ p {p_total:.3f}s, d {d_total:.3f}s ], "
+    #     f"mean [ p {p_mean:.3f}s, d {d_mean:.3f}s ], "
+    #     f"count [ p {p_count}, d {d_count} ]."
+    # )
+    capture_str = f"{engine.capture_stats['time']:.2f} sec" if engine.capture_stats else "--"
+    print(
+        f"{setup_prefix} … " f"throughput: {throughput:.3f} tok/s … ",
+        f"total time: {total_time:.3f}s … "
+        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+        f"steps: {engine.context.step_count:d} … "
+        f"capture {capture_str}",
+    )
+    print("~~~")
+
+    # Stop Nsight profiler.
+    if os.environ.get("NSIGHT_PREFIX"):
+        torch.cuda.cudart().cudaProfilerStop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/inference/gpt/gpt_dynamic_inference_12b.sh b/examples/inference/gpt/gpt_dynamic_inference_12b.sh
new file mode 100644
index 00000000000..ca21bb170a5
--- /dev/null
+++ b/examples/inference/gpt/gpt_dynamic_inference_12b.sh
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+# Run dynamic batching inference on the 12B GPT model.
+
+set -u
+
+# Libraries.
+pip install simpy
+pip install sentencepiece
+pip install tiktoken
+
+# Environment variables.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Checkpoint.
+: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
+: ${TOKENIZER_MODEL:?"TOKENIZER_MODEL is not set"}
+
+# Prompts.
+: ${NUM_TOKENS_TO_PROMPT="8 32"}
+: ${NUM_TOKENS_TO_GENERATE=256}
+: ${INCOMING_REQUESTS_DURATION=10.}
+: ${INCOMING_REQUESTS_PER_SEC=100.}
+
+# Dynamic context.
+: ${BUFFER_SIZE_GB=50.}
+
+# Cuda graphs.
+: ${NUM_CUDA_GRAPHS=16}
+
+# Miscellaneous.
+: ${USE_COORDINATOR=0}
+: ${ENGINE=dynamic}
+: ${EXTRA_ARGS=""}
+# NSIGHT_PREFIX=/path/to/nsight/profile
+
+# Arguments.
+ARGS=" \ + --no-persist-layer-norm \ + --apply-layernorm-1p \ + --no-position-embedding \ + --group-query-attention \ + --num-query-groups 8 \ + --load ${CHECKPOINT_DIR} \ + --use-checkpoint-args \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --use-rotary-position-embeddings \ + --position-embedding-type rope \ + --rotary-base 1000000 \ + --rotary-percent 1.0 \ + --swiglu \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --exit-duration-in-mins 5740 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 40 \ + --hidden-size 5120 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 32 \ + --kv-channels 128 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --micro-batch-size 64 \ + --bf16 \ + --tokenizer-type TikTokenizer \ + --tiktoken-pattern v2 \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --distributed-timeout-minutes 2400 \ + --use-flash-attn \ + --inference-rng-tracker \ + \ + --inference-dynamic-batching \ + --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \ + \ + ${EXTRA_ARGS} \ +" + +# Cuda graphs. +if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then + ARGS+=" \ + --cuda-graph-impl local \ + --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \ + " +else + ARGS+=" \ + --cuda-graph-impl none \ + " +fi + +# Prompts. +if [[ -v PROMPTS ]]; then + ARGS+=" \ + --prompts ${PROMPTS} \ + --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ + " +elif [[ -v PROMPT_FILE ]]; then + ARGS+=" \ + --prompt-file ${PROMPT_FILE} \ + --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ + " +else + ARGS+=" \ + --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \ + --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ + --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \ + --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \ + " +fi + +# Command. +if [[ "${USE_COORDINATOR}" == "0" ]]; then + CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}" +else + CMD="python -um examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}" +fi + +if [[ -v NSIGHT_PREFIX ]]; then + CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}" +fi + +echo "~~~" +echo "CMD ... ${CMD}." +echo "~~~" +eval ${CMD} diff --git a/examples/inference/gpt/gpt_dynamic_inference_357m.sh b/examples/inference/gpt/gpt_dynamic_inference_357m.sh new file mode 100644 index 00000000000..cc99bdddec1 --- /dev/null +++ b/examples/inference/gpt/gpt_dynamic_inference_357m.sh @@ -0,0 +1,115 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +# Run dynamic batching inference on the 357M GPT model. + +set -u + +# Libraries. +pip install simpy +pip install sentencepiece +pip install tiktoken + +# Environment variables. +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# Checkpoint. +: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"} +: ${VOCAB_FILE:?"VOCAB_FILE is not set"} +: ${MERGE_FILE:?"MERGE_FILE is not set"} + +# Prompts. +: ${NUM_TOKENS_TO_PROMPT="8 32"} +: ${NUM_TOKENS_TO_GENERATE=256} +: ${INCOMING_REQUESTS_DURATION=10.} +: ${INCOMING_REQUESTS_PER_SEC=100.} + +# Dynamic context. +: ${BUFFER_SIZE_GB=50.} + +# Cuda graphs. +: ${NUM_CUDA_GRAPHS=16} + +# Miscellaneous. +: ${USE_COORDINATOR=0} +: ${ENGINE=dynamic} +: ${NPROC_PER_NODE=1} +: ${EXTRA_ARGS=""} +# NSIGHT_PREFIX=/path/to/nsight/profile + +# Arguments. 
+ARGS=" \
+    --exit-on-missing-checkpoint \
+    --transformer-impl local \
+    --load ${CHECKPOINT_DIR} \
+    --tokenizer-type GPT2BPETokenizer \
+    --vocab-file ${VOCAB_FILE} \
+    --merge-file ${MERGE_FILE} \
+    --max-position-embeddings 2048 \
+    --seq-length 2048 \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --num-attention-heads 16 \
+    --hidden-size 1024 \
+    --bf16 \
+    --micro-batch-size 1 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --seed 42 \
+    --use-flash-attn \
+    --inference-rng-tracker \
+    \
+    --inference-dynamic-batching \
+    --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \
+    \
+    ${EXTRA_ARGS} \
+"
+
+# Cuda graphs.
+if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
+    ARGS+=" \
+        --cuda-graph-impl local \
+        --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
+    "
+else
+    ARGS+=" \
+        --cuda-graph-impl none \
+    "
+fi
+
+# Prompts.
+if [[ -v PROMPTS ]]; then
+    ARGS+=" \
+        --prompts ${PROMPTS} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+    "
+elif [[ -v PROMPT_FILE ]]; then
+    ARGS+=" \
+        --prompt-file ${PROMPT_FILE} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+    "
+else
+    ARGS+=" \
+        --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+        --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
+        --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
+    "
+fi
+
+# Command.
+if [[ "${USE_COORDINATOR}" == "0" ]]; then
+    CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
+else
+    CMD="python -m torch.distributed.run --nproc-per-node ${NPROC_PER_NODE} -m examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}"
+fi
+
+if [[ -v NSIGHT_PREFIX ]]; then
+    CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}"
+fi
+
+echo "~~~"
+echo "CMD ... ${CMD}."
+echo "~~~"
+eval ${CMD}
diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
new file mode 100644
index 00000000000..aa42f492ca4
--- /dev/null
+++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ +import asyncio +import json +import logging +import os +import time +import warnings +from collections import defaultdict +from typing import List + +from megatron.training.arguments import parse_and_validate_args +import torch +import torch.distributed as dist + +from examples.inference.gpt.utils import Request, build_dynamic_engine_setup_prefix, build_requests +from megatron.core.inference.engines import DynamicInferenceEngine +from megatron.core.inference.engines.dynamic_engine import EngineState +from megatron.core.inference.inference_client import InferenceClient +from megatron.core.inference.inference_request import DynamicInferenceRequestRecord +from megatron.core.inference.sampling_params import SamplingParams +from megatron.inference.utils import ( + add_inference_args, + get_dynamic_inference_engine, + get_model_for_inference, +) +from megatron.training import get_args, get_tokenizer, initialize_megatron +from megatron.core.utils import configure_nvtx_profiling + +# pylint: disable=line-too-long + +logging.basicConfig(level=logging.INFO, force=True) + + +async def suspend_resume_cycle(client, engine, args, futures): + """Wait for all in-flight requests, then suspend/train/resume.""" + await asyncio.gather(*futures) + + client.pause_engines() + await engine.wait_until(EngineState.PAUSED) + client.suspend_engines() + await engine.wait_until(EngineState.SUSPENDED) + if args.suspend_timeout > 0: + await asyncio.sleep(args.suspend_timeout) + client.resume_engines() + await engine.wait_until(EngineState.RESUMED) + client.unpause_engines() + await engine.wait_until(EngineState.RUNNING) + + +async def main( + engine: DynamicInferenceEngine, + requests: List[Request], + port: int | None = None, + sampling_params: SamplingParams | None = None, +): + if sampling_params is not None: + warnings.warn( + "The `sampling_params` argument is deprecated. " + "Sampling parameters are specified per request.", + DeprecationWarning, + ) + + # once you call engine.start_listening_to_data_parallel_coordinator, + # the engine will start accepting requests from the data parallel coordinator. + # and processing them in an asyncio coroutine. + # leaving inference_coordinator_port as None will find a free port automatically. + args = get_args() + + dp_addr = await engine.start_listening_to_data_parallel_coordinator( + inference_coordinator_port=port, + launch_inference_coordinator=True, + coordinator_schedule_output_path=args.coordinator_schedule_output_path, + ) + + # All ranks agree on the number of suspend/resume cycles from args. + num_suspend_resume_cycles = len(requests) // args.suspend_resume_interval if args.suspend_resume_interval else 0 + + # Create client and run example. + if dist.get_rank() == 0: + client = InferenceClient(dp_addr, deserialize=True) # submits requests to the inference coordinator + client.start() + base_arrival_time = time.time_ns() / 10**9 + for request in requests: + request.time_arrival = request.time_offset + base_arrival_time + futures = [] + num_requests_total = len(requests) + num_requests_added = 0 + next_suspend_at = args.suspend_resume_interval or 0 + cycles_done = 0 + + while True: + current_time = time.time_ns() / 10**9 + if args.incoming_requests_per_step is None: + # Only add requests that have arrived at the current time. 
+ while ( + num_requests_added < num_requests_total + and requests[num_requests_added].time_arrival <= current_time + ): + request = requests[num_requests_added] + # These add-request calls will queue up the request on a zmq socket and return + # instantaneously. They will return an asyncio future which can be awaited for + # request completion. + futures.append(client.add_request(request.prompt_text, request.sampling_params)) + num_requests_added += 1 + + if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles: + await suspend_resume_cycle(client, engine, args, futures) + cycles_done += 1 + next_suspend_at += args.suspend_resume_interval + + else: + # Add deterministic number of requests (generally used for debugging). + for i in range( + min(args.incoming_requests_per_step, num_requests_total - num_requests_added) + ): + # Change sampling parameters to force different generation lengths. + request = requests[num_requests_added] + n = request.sampling_params.num_tokens_to_generate + request.sampling_params.num_tokens_to_generate = n + i + futures.append(client.add_request(request.prompt_text, request.sampling_params)) + num_requests_added += 1 + + if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles: + await suspend_resume_cycle(client, engine, args, futures) + cycles_done += 1 + next_suspend_at += args.suspend_resume_interval + + if num_requests_added == num_requests_total: + break + # Relinquish control since there are no more requests to add at the moment. This allows the engine to run. + await asyncio.sleep(0) + + # While we wait for the requests to complete, the engine runs in the background. + results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures) + else: + # Non-rank-0: match the suspend/resume cycles that rank 0 drives. + for _ in range(num_suspend_resume_cycles): + await engine.wait_until(EngineState.PAUSED) + await engine.wait_until(EngineState.SUSPENDED) + await engine.wait_until(EngineState.RESUMED) + await engine.wait_until(EngineState.RUNNING) + + if dist.get_rank() == 0: + # Write results to JSON. Primarily used for functional testing. + if args.output_path: + json_results = {} + throughputs = [] + + for req in results: + result_dict = { + "input_prompt": req.prompt, + "generated_text": req.generated_text.replace("\n", "\\n"), + "generated_tokens": req.generated_tokens, + "latency": req.latency, # InferenceClient populates this field in the returned future. + } + if req.sampling_params.return_log_probs: + result_dict["logprobs"] = req.prompt_log_probs + req.generated_log_probs + throughput = len(req.generated_tokens) / req.latency + throughputs.append(throughput) + if req.routing_indices is not None: + result_dict["routing_indices"] = req.routing_indices.tolist() + + json_results[req.request_id] = result_dict + throughput_dict = {"throughput": throughputs} + if args.throughput_check_only: + json_results = throughput_dict + with open(args.output_path, "w") as fp: + json.dump(json_results, fp, indent=4) + else: + print("Results:") + unique_prompt_map = defaultdict(list) + for req in results: + unique_prompt_map[req.prompt].append(req) + for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()): + print( + f"%d/%d. prompt '%s' ... [%d] output '%s'." + % ( + idx, + len(unique_prompt_map), + prompt_text.replace("\n", "\\n"), + len(reqs), + reqs[0].generated_text.replace("\n", "\\n"), + ) + ) + + # Pause before stopping: STOP requires PAUSED or SUSPENDED state. 
+ client.pause_engines() + + await engine.wait_until(EngineState.PAUSED) + + if dist.get_rank() == 0: + client.stop_engines() + + await engine.wait_until(EngineState.STOPPED) + + if dist.get_rank() == 0: + client.shutdown_coordinator() + client.stop() + logging.info(f"Rank: {dist.get_rank()} stopped their engine instance successfully.") + + +if __name__ == "__main__": + # enable inference mode in the very beginning as some fp8 optimizations + # check for it. + with torch.inference_mode(): + args = parse_and_validate_args( + extra_args_provider=add_inference_args, + args_defaults={'no_load_rng': True, 'no_load_optim': True}, + ) + initialize_megatron() + configure_nvtx_profiling(True) + + tokenizer = get_tokenizer() + + # Sampling params. + sampling_params = SamplingParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate, + termination_id=( + args.termination_id if args.termination_id is not None else tokenizer.eod + ), + ) + + model = get_model_for_inference() + + requests = build_requests(args, tokenizer, sampling_params) + + engine = get_dynamic_inference_engine(model=model) + + if dist.get_rank() == 0: + setup_prefix = build_dynamic_engine_setup_prefix(args, model, engine.context, requests) + print("~~~") + print(setup_prefix) + print("~~~") + + # Start Nsight profiler. + if os.environ.get("NSIGHT_PREFIX"): + torch.cuda.cudart().cudaProfilerStart() + + asyncio.run(main(engine, requests, args.inference_coordinator_port)) + + # Stop Nsight profiler. + if os.environ.get("NSIGHT_PREFIX"): + torch.cuda.cudart().cudaProfilerStop() diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py new file mode 100644 index 00000000000..d3dd619eaa1 --- /dev/null +++ b/examples/inference/gpt/gpt_static_inference.py @@ -0,0 +1,249 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+
+import os
+import sys
+import time
+from argparse import Namespace
+
+from megatron.training.arguments import parse_and_validate_args
+import torch
+
+from megatron.core.inference.contexts import StaticInferenceContext
+from megatron.core.inference.engines import StaticInferenceEngine
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
+    GPTInferenceWrapper,
+)
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.text_generation_controller import (
+    TextGenerationController,
+)
+from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
+from megatron.core.transformer.module import MegatronModule
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+import asyncio
+import json
+from typing import List
+
+from examples.inference.gpt.utils import build_requests
+from megatron.inference.utils import add_inference_args, get_model_for_inference
+from megatron.training import get_args, get_tokenizer, print_rank_0
+from megatron.training.initialize import initialize_megatron
+
+
+def add_static_inference_args(parser):
+    """Static inference arguments."""
+
+    add_inference_args(parser)
+
+    group = parser.add_argument_group(title='Static inference')
+    group.add_argument(
+        "--max-batch-size",
+        type=int,
+        default=None,
+        dest="max_batch_size",
+        help='Deprecated, use `--inference-max-requests` instead',
+    )
+    group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
+
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInferenceEngine:
+    """Utility to get the relevant backend for running inference.
+
+    This function is intended to automatically choose the TRTLLM backend when possible
+    and otherwise fall back to the MCore backend. The TRTLLM backend is not implemented
+    yet, so the MCore backend is always used.
+
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model.
+
+    Returns:
+        StaticInferenceEngine: The inference engine
+    """
+    tokenizer = build_tokenizer(args)
+    inference_context = StaticInferenceContext(
+        args.inference_max_requests, args.inference_max_seq_length
+    )
+    inference_wrapped_model = GPTInferenceWrapper(model, inference_context)
+    text_generation_controller = TextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+    )
+    engine_kwargs = {
+        "text_generation_controller": text_generation_controller,
+        "legacy": args.use_legacy_static_engine,
+    }
+    if not args.use_legacy_static_engine:
+        engine_kwargs["buffer_size_gb"] = args.inference_dynamic_batching_buffer_size_gb
+    return StaticInferenceEngine(**engine_kwargs)
+
+
+async def generate(
+    inference_engine: StaticInferenceEngine, sampling_params: SamplingParams, prompts: List[str]
+) -> List[InferenceRequest]:
+    async def collect_stream(prompt, request_id, stream_generator):
+        print(f"Request {request_id}: {prompt}", end="", flush=True)
+        prev_idx = 0
+        async for output in stream_generator:
+            print(output.generated_text[prev_idx:], end="", flush=True)
+            prev_idx = len(output.generated_text)
+        print()
+
+    request_ids: List[int] = [
+        inference_engine.add_request(prompt=prompt, sampling_params=sampling_params, streaming=True)
+        for prompt in prompts
+    ]
+    stream_generators = [
+        inference_engine.get_stream_generator(request_id) for request_id in request_ids
+    ]
+
+    tasks = [
+        asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
+        for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
+    ]
+
+    await inference_engine.run_engine_async()
+    await asyncio.gather(*tasks)
+
+    results: List[InferenceRequest] = [
+        inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
+    ]
+
+    return results
+
+
+@torch.inference_mode()
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # The micro batch size does not need to be set by the user.
(It is calculated based on inference-batch-times-seqlen-threshold argument) + args = parse_and_validate_args( + extra_args_provider=add_static_inference_args, + args_defaults={ + 'no_load_rng': True, + 'no_load_optim': True, + 'micro_batch_size': 1, + 'exit_on_missing_checkpoint': True, + }, + ) + initialize_megatron() + + model = get_model_for_inference() + + inference_engine = get_inference_engine(args, model) + + sampling_params = SamplingParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate, + top_n_logprobs=args.top_n_logprobs, + ) + + # Build tokenizer + tokenizer = build_tokenizer(args) + + requests = build_requests(args, tokenizer) + prompts = [r.prompt_text for r in requests] + + if args.cuda_graph_impl == "local": + print(f"Running warmup for CUDA graphs...") + inference_engine.generate( + prompts=["warmup"], sampling_params=SamplingParams(num_tokens_to_generate=10) + ) + start_time = time.perf_counter() + if args.stream: + results: List[InferenceRequest] = asyncio.run( + generate(inference_engine, sampling_params, prompts) + ) + else: + results: List[InferenceRequest] = inference_engine.generate( + prompts=prompts, sampling_params=sampling_params + ) + end_time = time.perf_counter() + latency = end_time - start_time + + if torch.distributed.get_rank() == 0 and args.output_path: + results_output = {} + for idx, result in enumerate(results): + result_dict = { + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens.tolist(), + 'tpot': result.tpot, + 'latency': latency, + } + if sampling_params.top_n_logprobs > 0: + result_dict['generated_top_n_logprobs'] = result.generated_top_n_logprobs + if sampling_params.return_log_probs: + response_logprobs = result.prompt_log_probs + result.generated_log_probs + result_dict["logprobs"] = response_logprobs + results_output[result.request_id] = result_dict + + with open(args.output_path, 'w') as f: + json.dump(results_output, f) + + # Print unique prompts + outputs. + if torch.distributed.get_rank() == 0: + + print("~~~~ Unique prompts + outputs. ~~~~") + + # Map results by their prompt. + from collections import defaultdict + + unique_prompt_map = defaultdict(list) + for result_idx, result in enumerate(results): + unique_prompt_map[result.prompt].append(result_idx) + + # Print unique prompts + outputs. + for unique_idx, (prompt_text, result_idxs) in enumerate(unique_prompt_map.items()): + result_idx = result_idxs[0] + result = results[result_idx] + generated_text = result.generated_text.replace("\n", "\\n") + print( + f"{unique_idx}/{len(unique_prompt_map)} [{len(result_idxs)}]. {prompt_text} " + f"... {generated_text}" + ) + + stats = torch.cuda.memory_stats() + print_rank_0( + "static | cg %d | %s | reqs %d [ batch %d ] ... mem %.1f/%.1f ... time %.3f." + % ( + args.cuda_graph_impl == "local", + ( + f"" + if args.prompts + else " %s, %d, %.1e, %.1e" + % ( + "(%s)" % " ".join(map(str, args.num_tokens_to_prompt)), + args.num_tokens_to_generate, + args.incoming_requests_duration, + args.incoming_requests_per_sec, + ) + ), + len(requests), + args.inference_max_requests, + stats["allocated_bytes.all.peak"] / (1024**3), + stats["reserved_bytes.all.peak"] / (1024**3), + latency, + ) + ) + # Force immediate process exit to bypass torchrun's atexit NCCL teardown when + # CUDA graphs have captured collectives (see PyTorch issue #115388). 
This can
+    # sometimes lead to hangs in the atexit handler.
+    # We do this only when CUDA graphs are enabled.
+    if args.cuda_graph_impl != "none":
+        print(f"[main] rank {torch.distributed.get_rank()}: finished", flush=True)
+        os._exit(0)
+    else:
+        torch.distributed.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py
new file mode 100644
index 00000000000..c9b1c05c544
--- /dev/null
+++ b/examples/inference/gpt/utils.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import copy
+import itertools
+import json
+import random
+import time
+from argparse import ArgumentParser, Namespace
+from functools import partial
+from typing import Any, List, Optional
+
+import torch
+from tqdm import tqdm
+
+from megatron.core.inference.contexts import DynamicInferenceContext
+from megatron.core.inference.contexts.dynamic_context import get_mem_size_str
+from megatron.core.inference.inference_request import DynamicInferenceRequest
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.transformer.module import MegatronModule
+from megatron.training import get_args
+
+
+def get_default_sampling_params(termination_id: Optional[int] = None) -> SamplingParams:
+    """Return the sampling params used when a request does not provide its own."""
+    return SamplingParams(
+        temperature=1.0,
+        top_k=1,
+        top_p=0.0,
+        return_log_probs=False,
+        num_tokens_to_generate=30,
+        termination_id=termination_id,
+    )
+
+
+def get_curr_time() -> float:
+    """Get synchronized time across ranks."""
+    curr_time = torch.tensor([time.time_ns()], dtype=torch.int64, device="cuda")
+    if torch.distributed.is_initialized():
+        torch.distributed.broadcast(curr_time, src=0)
+    return curr_time.item() / 10**9
+
+
+class Request:
+    """Class to hold attributes for a single request.
+
+    A request is initialized with its prompt text. As it is added, processed,
+    and completed through the inference engine, the request is populated with its
+    start time, end time, and output tokens.
+
+    Args:
+        prompt_text (str): Prompt text.
+        time_offset (float): Artificial time offset for simulating incoming
+            requests. This value is later added to the `base_arrival_time` to
+            simulate the request's arrival time.
+        tokenizer (Any): Tokenizer for tokenizing the prompt.
+        sampling_params (Optional[SamplingParams]): Per-request sampling params;
+            defaults to `get_default_sampling_params()` when not provided.
+    """
+
+    def __init__(
+        self,
+        prompt_text: str,
+        time_offset: float,
+        tokenizer: Any,
+        sampling_params: Optional[SamplingParams] = None,
+    ):
+        self.prompt_text = prompt_text
+        self.prompt_tokens = tokenizer.tokenize(prompt_text)
+        self.output_text = None
+        self.output_tokens = []
+        self.time_offset = time_offset
+        self.time_arrival = None
+        self.time_start = None
+        self.time_end = None
+        self.ttft = None  # Time-to-first-token in seconds
+        self.state = "not-started"
+        if sampling_params is None:
+            sampling_params = get_default_sampling_params(tokenizer.eod)
+        # Deep-copy so per-request mutations do not leak across requests.
+        self.sampling_params: SamplingParams = copy.deepcopy(sampling_params)
+
+    def __str__(self) -> str:
+        return "state '%s'; toffset %.1e; prompt len %d; output len %d; '%s'" % (
+            self.state,
+            self.time_offset,
+            len(self.prompt_tokens),
+            len(self.output_tokens),
+            self.prompt_text,
+        )
+
+
+def get_time_offsets(
+    seed: int | None,
+    incoming_requests_per_step: int | None,
+    incoming_requests_per_sec: float,
+    num_requests: int,
+) -> list[float]:
+    """Get example time offsets."""
+
+    # Time offsets to add all requests at once.
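+    # (-1 is used as a sentinel for "no simulated arrival delay", i.e. every
+    # request is treated as available immediately.)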
+    if incoming_requests_per_step is not None or incoming_requests_per_sec <= 0:
+        return [-1] * num_requests
+
+    incoming_requests_duration = num_requests / incoming_requests_per_sec
+    incoming_requests_duration *= 2  # extra margin, to accommodate time sampling
+
+    random.seed(seed)
+
+    import simpy  # Imported lazily so test environments without simpy can still import this module.
+
+    # Generate random time offsets (a Poisson process: exponential inter-arrival times at rate r).
+    def arrival(r):
+        while True:
+            yield env.timeout(random.expovariate(r))
+            time_offsets.append(env.now)
+
+    time_offsets = []
+    env = simpy.Environment()
+    env.process(arrival(incoming_requests_per_sec))
+    env.run(incoming_requests_duration)
+
+    # Ensure at least a single request.
+    if len(time_offsets) == 0:
+        time_offsets = [0.0]
+
+    # Ensure first time is 0.
+    time_offsets = [to - time_offsets[0] for to in time_offsets]
+
+    # Truncate to num_requests.
+    assert len(time_offsets) >= num_requests
+    time_offsets = time_offsets[:num_requests]
+
+    return time_offsets
+
+
+def get_cli_requests(
+    args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None
+) -> list[Request]:
+    """Build requests from prompts passed on the command line."""
+
+    # Get time offsets.
+    t_offsets = get_time_offsets(
+        args.seed,
+        args.incoming_requests_per_step,
+        args.incoming_requests_per_sec,
+        len(args.prompts),
+    )
+
+    # Init requests.
+    requests = [Request(p, t, tokenizer, sampling_params) for p, t in zip(args.prompts, t_offsets)]
+    return requests
+
+
+def get_synthetic_requests(
+    args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None
+) -> list[Request]:
+    """Get example requests."""
+
+    # Get time offsets.
+    time_offsets = get_time_offsets(
+        args.seed,
+        args.incoming_requests_per_step,
+        args.incoming_requests_per_sec,
+        int(args.incoming_requests_per_sec * args.incoming_requests_duration),
+    )
+
+    # Build prompts with expected lengths.
+    assert (
+        len(args.num_tokens_to_prompt) == 2
+        and args.num_tokens_to_prompt[1] >= args.num_tokens_to_prompt[0]
+    )
+    max_prompt_length = args.num_tokens_to_prompt[1]
+    max_prompt_text = "hi " * max_prompt_length
+    max_prompt_tokens = tokenizer.tokenize(max_prompt_text)
+    prompt_lengths = [random.randint(*args.num_tokens_to_prompt) for _ in time_offsets]
+    prompt_tokens_list = [max_prompt_tokens[:l] for l in prompt_lengths]
+    prompt_texts = [tokenizer.detokenize(tt) for tt in prompt_tokens_list]
+
+    # Init requests.
+    assert len(prompt_texts) == len(time_offsets)
+    requests = [
+        Request(t, o, tokenizer, sampling_params=sampling_params)
+        for t, o in zip(prompt_texts, time_offsets)
+    ]
+
+    return requests
+
+
+def get_requests_from_file(
+    args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None
+) -> list[Request]:
+    """Get requests from a file."""
+    if not args.prompt_file:
+        raise ValueError("Prompt file is required to read requests from a file.")
+
+    # Load prompts.
+    with open(args.prompt_file) as f:
+        n_prompts = sum(1 for _ in f)
+    prompts = []
+    if sampling_params is None:
+        sampling_params = get_default_sampling_params(tokenizer.eod)
+    sampling_params_list = []
+    with open(args.prompt_file) as f:
+        for line in tqdm(f.readlines(), "read prompt file", total=n_prompts):
+            line_dict = json.loads(line)
+            prompts.append(line_dict["text"])
+
+            sp = copy.deepcopy(sampling_params)
+            if args.num_tokens_from_file:
+                sp.num_tokens_to_generate = line_dict["chatgpt_output_token_length"]
+            sampling_params_list.append(sp)
+
+            if len(prompts) == args.prompt_file_num_truncate:
+                break
+
+    # Get time offsets.
+ time_offsets: list[float] = get_time_offsets( + args.seed, args.incoming_requests_per_step, args.incoming_requests_per_sec, len(prompts) + ) + + # Init requests. + requests = [ + Request(p, t, tokenizer, sp) + for p, t, sp in tqdm( + zip(prompts, time_offsets, sampling_params_list), "init requests", total=len(prompts) + ) + ] + + return requests + + +def build_requests( + args: Namespace, tokenizer: Any, sampling_params: Optional[SamplingParams] = None +) -> list[Request]: + # Check if we have any prompts (from command line or JSONL) + if args.prompts: + if args.prompt_file: + raise ValueError("Cannot use both --prompts and --prompt-file") + return get_cli_requests(args, tokenizer, sampling_params) + elif args.prompt_file: + return get_requests_from_file(args, tokenizer, sampling_params) + else: + return get_synthetic_requests(args, tokenizer, sampling_params) + + +def get_model_size_str(model): + n = sum(p.numel() for p in model.parameters()) + for exp, suffix in ((12, "t"), (9, "b"), (6, "m"), (3, "k"), (0, "")): + nquery = int(10**exp) + if n > nquery: + return "%d%s" % (n // nquery, suffix) + raise Exception("something went wrong.") + + +def build_dynamic_engine_setup_prefix( + args: Namespace, + model: MegatronModule, + context: DynamicInferenceContext, + requests: list[DynamicInferenceRequest], +): + """ + Returns a compact, pipe-separated summary of the dynamic-batching setup. + + Example output: + + `dynamic | cg True | prompts: synth(16 256), n 1024, g 512, t 1.0e+02 5.0e-01 | bf 4, 1.2 [r 1024, t 8192] | gtd 0.50 [r 512] | reqs 100` # pylint: disable=line-too-long + + Args: + args (Namespace): Command-line arguments for this run. + context (DynamicInferenceContext): Stores limits such as `max_requests`, + `max_tokens`, and `gtd_request_count`. + requests (List[DynamicInferenceRequest]): List of inference requests. + + Returns: + A configuration string for logging. + """ + # CUDA graph config + if args.cuda_graph_impl == "local": + cg_str = f"graphs {len(context.cuda_graph_batch_dimensions_list)}" + else: + cg_str = "--" + + # Unified memory (UVM). + uvm_str = f"uvm {int(context.unified_memory_level)}" + + # Prompt description + prompt_src_str = ( + "cli" + if args.prompts + else ( + "file" + if args.prompt_file + else f"synth({', '.join(map(str, args.num_tokens_to_prompt))})" + ) + ) + request_str = ( + f"requests: {prompt_src_str}, " f"n {len(requests):d}, g {args.num_tokens_to_generate:d}, " + ) + request_str += ( + f"dur {args.incoming_requests_duration:.1e} " f"r/sec {args.incoming_requests_per_sec:.1e}" + if args.incoming_requests_per_step is None + else f"r/step {args.incoming_requests_per_step}" + ) + + # Buffer limits config + buffer_limits_str = ( + f"bf: {get_mem_size_str(args.inference_dynamic_batching_buffer_size_gb*1024**3)}, " + f"{context.kv_block_allocator.active_count} chunks " + f"[r {context.max_requests}, t {context.max_tokens}]" + ) + + parts = [get_model_size_str(model), "dynamic", cg_str, uvm_str, request_str, buffer_limits_str] + + return " | ".join(parts) + + +def get_global_peak_memory_stats_bytes() -> dict: + """Peak allocated CUDA memory aggregated across ranks (MAX), in bytes. + + Uses `torch.cuda.max_memory_allocated()` and assumes peak stats were reset + before the benchmark run. 
+ """ + peak_alloc = int(torch.cuda.max_memory_allocated()) + if torch.distributed.is_available() and torch.distributed.is_initialized(): + t = torch.tensor([peak_alloc], device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) + peak_alloc = int(t[0].item()) + return {"mem-max-allocated-bytes": peak_alloc} diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py new file mode 100644 index 00000000000..9d8f4465f65 --- /dev/null +++ b/examples/inference/llama_mistral/huggingface_reference.py @@ -0,0 +1,25 @@ +import argparse +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +# Set up argument parsing +parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.") +parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation") +parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint") + +# Parse command-line arguments +args = parser.parse_args() + +model_path = args.model_path +prompt = args.prompt + +config = AutoConfig.from_pretrained(model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path, config=config) +model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda() + +inputs = tokenizer(prompt, return_tensors="pt") +for key in inputs: + inputs[key] = inputs[key].cuda() +# top_k, top_p and do_sample are set for greedy argmax based sampling + +outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh b/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh new file mode 100755 index 00000000000..cc8cfac5e69 --- /dev/null +++ b/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh @@ -0,0 +1,68 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 8 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Fill in checkpoint path to Llama 4 Scout to run +CHECKPOINT= +PROMPTS="What is the capital of France?" 
+TOKENS_TO_GENERATE=4 +MAX_BATCH_SIZE=2 + +MODEL_ARGS=" \ + --micro-batch-size 1 \ + --bf16 \ + --no-masked-softmax-fusion \ + --disable-bias-linear \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --no-rope-fusion \ + --normalization RMSNorm \ + --swiglu \ + --num-layers 48 \ + --hidden-size 5120 \ + --ffn-hidden-size 16384 \ + --num-attention-heads 40 \ + --group-query-attention \ + --num-query-groups 8 \ + --qk-layernorm \ + --num-experts 16 \ + --moe-ffn-hidden-size 8192 \ + --moe-router-score-function sigmoid \ + --moe-router-topk 1 \ + --moe-router-topk-scaling-factor 1.0 \ + --moe-shared-expert-intermediate-size 8192 \ + --moe-aux-loss-coeff 1e-3 \ + --moe-token-dispatcher-type alltoall \ + --moe-token-drop-policy probs \ + --moe-router-load-balancing-type seq_aux_loss \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --tokenizer-type HuggingFaceTokenizer \ + --make-vocab-size-divisible-by 128 \ + --use-mcore-models \ + --rotary-interleaved \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --rope-scaling-factor 8.0 \ + --use-rope-scaling \ + --no-bias-swiglu-fusion \ + --qk-l2-norm \ + --moe-apply-probs-on-input \ + --moe-router-dtype fp64 \ +" + +torchrun $DISTRIBUTED_ARGS -m examples.inference.gpt.gpt_static_inference \ + --load ${CHECKPOINT} \ + --tokenizer-model unsloth/Llama-4-Scout-17B-16E-Instruct \ + --dist-ckpt-strictness log_unexpected \ + --tensor-model-parallel-size 8 \ + --prompts ${PROMPTS} \ + --num-tokens-to-generate ${TOKENS_TO_GENERATE} \ + --max-batch-size ${MAX_BATCH_SIZE} \ + ${MODEL_ARGS} diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh new file mode 100755 index 00000000000..06584f0917d --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# This example will start serving the Llama3.1-8B model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
+ echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --use-checkpoint-args \ + --disable-bias-linear \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --use-rope-scaling \ + --use-rotary-position-embeddings \ + --swiglu \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 131072 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 8192 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.sh b/examples/inference/llama_mistral/run_text_generation_llama3.sh new file mode 100755 index 00000000000..c5fc4103ab5 --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_llama3.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# This example will start serving the Llama3-8B model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
+ echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --use-checkpoint-args \ + --disable-bias-linear \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --use-rotary-position-embeddings \ + --swiglu \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 8192 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 8192 diff --git a/examples/inference/llama_mistral/run_text_generation_mistral.sh b/examples/inference/llama_mistral/run_text_generation_mistral.sh new file mode 100755 index 00000000000..4358fd494c7 --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_mistral.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# This example will start serving the Mistral-7B-v0.3 model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
+ echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --use-checkpoint-args \ + --apply-layernorm-1p \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --use-flash-attn \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --ffn-hidden-size 14336 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 4096 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 4096 \ + --seed 101 diff --git a/examples/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh similarity index 92% rename from examples/run_text_generation_server_345M.sh rename to examples/inference/run_text_generation_server_345M.sh index a151b984676..e8e61adb163 100755 --- a/examples/run_text_generation_server_345M.sh +++ b/examples/inference/run_text_generation_server_345M.sh @@ -26,9 +26,6 @@ torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ --fp16 \ --micro-batch-size 1 \ --seq-length 1024 \ - --out-seq-length 1024 \ - --temperature 1.0 \ --vocab-file $VOCAB_FILE \ --merge-file $MERGE_FILE \ - --top_p 0.9 \ --seed 42 diff --git a/examples/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh similarity index 92% rename from examples/run_text_generation_server_345M_8_tensor_parallel.sh rename to examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh index 027ab421727..368cec3b312 100755 --- a/examples/run_text_generation_server_345M_8_tensor_parallel.sh +++ b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh @@ -24,9 +24,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_s --fp16 \ --micro-batch-size 1 \ --seq-length 1024 \ - --out-seq-length 1024 \ - --temperature 1.0 \ --vocab-file $VOCAB_FILE \ --merge-file $MERGE_FILE \ - --top_p 0.9 \ --seed 42 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py new file mode 100644 index 00000000000..1aca74b3176 --- /dev/null +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -0,0 +1,162 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+
+import os
+import sys
+from argparse import Namespace
+
+import torch
+
+import pretrain_t5
+from megatron.core.inference.engines import AbstractEngine, StaticInferenceEngine
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
+from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import (
+    T5InferenceWrapper,
+)
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import (
+    EncoderDecoderTextGenerationController,
+)
+from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
+from megatron.core.transformer.module import MegatronModule
+from pretrain_t5 import model_provider
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+from typing import List
+
+from megatron.core import mpu
+from megatron.training import get_args, get_model, get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument(
+        "--return-log-probs",
+        action='store_true',
+        default=False,
+        help='Return the log probabilities of the final output tokens',
+    )
+    group.add_argument(
+        "--num-tokens-to-generate",
+        type=int,
+        default=30,
+        help='Number of tokens to generate for each prompt',
+    )
+    group.add_argument(
+        "--encoder-prompts",
+        metavar='N',
+        type=str,
+        nargs='+',
+        help='Encoder input prompts, each within quotes and separated by spaces',
+    )
+    group.add_argument(
+        "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once'
+    )
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
+    """Utility to get the relevant engine for running inference.
+
+    This function would automatically choose the TRT-LLM backend when possible and
+    otherwise fall back to the MCore backend; since the TRT-LLM backend is not
+    implemented yet, the MCore static engine is always returned.
+
+    Args:
+        args (Namespace): The user arguments parsed from the command line
+        model (MegatronModule): The Megatron model.
+
+    Returns:
+        StaticInferenceEngine: The static inference engine
+    """
+    # Build tokenizer
+    tokenizer = build_tokenizer(args)
+
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size,
+    )
+
+    inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config)
+    text_generation_controller = EncoderDecoderTextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+    )
+    return StaticInferenceEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+
+
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overridden with the appropriate flags (see arguments.py).
+    # The micro batch size does not need to be set by the user; it is calculated
+    # from the inference-batch-times-seqlen-threshold argument.
+    initialize_megatron(
+        extra_args_provider=add_text_generate_args,
+        args_defaults={
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'micro_batch_size': 1,
+            'exit_on_missing_checkpoint': True,
+        },
+    )
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+
+    args = get_args()
+
+    inference_engine = get_inference_engine(args, model)
+
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+    )
+
+    # Build tokenizer
+    tokenizer = build_tokenizer(args)
+
+    # For T5, the prompt is provided as encoder input, so the decoder prompts are empty.
+    decoder_prompts = [""] * len(args.encoder_prompts)
+    args.prompts = decoder_prompts
+
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts,
+        add_BOS=True,
+        encoder_prompts=args.encoder_prompts,
+        sampling_params=sampling_params,
+    )
+
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt,
+                'generated_text': result.generated_text,
+                'generated_tokens': result.generated_tokens,
+            }
+            print(result)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/llama/README.md b/examples/llama/README.md
new file mode 100644
index 00000000000..9872185ab2f
--- /dev/null
+++ b/examples/llama/README.md
@@ -0,0 +1,144 @@
+# Llama Models
+
+## Table of contents
+- [1. Overview](#1-overview)
+- [2. Prerequisites](#2-prerequisites)
+- [3. Training Setup](#3-training-setup)
+- [4. Configuration](#4-configuration)
+- [5. Test Datasets](#5-test-datasets)
+- [6. FP8 Debugging](#6-fp8-debugging)
+
+## 1. Overview
+
+
+Train Llama models using FP8 precision with Megatron-Core.
+
+## 2.
Prerequisites + + +```bash +# Clone repository +export HOST_MEGATRON_LM_DIR="/path/to/your/host/megatron-lm" +git clone https://github.com/NVIDIA/Megatron-LM.git "$HOST_MEGATRON_LM_DIR" +cd "$HOST_MEGATRON_LM_DIR" +git checkout "core_r0.12.0" + +# Set paths +export HOST_CHECKPOINT_PATH="./checkpoints/llama3_8b_fp8" +export HOST_TENSORBOARD_LOGS_PATH="./tensorboard_logs/llama3_8b_fp8" + +# Optional: For real data +# export HOST_TOKENIZER_MODEL_PATH="/path/to/host/tokenizer.model" +# export HOST_DATA_PREFIX="/path/to/host/mydata_prefix" +``` + +## 3. Training Setup + + +### Using Mock Data +```bash +PYTORCH_IMAGE="nvcr.io/nvidia/pytorch:25.03-py3" + +docker run --rm --gpus all --ipc=host --ulimit memlock=-1 \ + -v "${HOST_MEGATRON_LM_DIR}:/workspace/megatron-lm" \ + -v "${HOST_CHECKPOINT_PATH}:/workspace/checkpoints" \ + -v "${HOST_TENSORBOARD_LOGS_PATH}:/workspace/tensorboard_logs" \ + --workdir /workspace/megatron-lm \ + $PYTORCH_IMAGE \ + bash examples/llama/train_llama3_8b_h100_fp8.sh \ + /workspace/checkpoints \ + /workspace/tensorboard_logs \ + 2>&1 | tee "${HOST_TENSORBOARD_LOGS_PATH}/training_mock_$(date +'%y-%m-%d_%H-%M-%S').log" +``` + +### Using Custom Data and Tokenizer +```bash +PYTORCH_IMAGE="nvcr.io/nvidia/pytorch:25.03-py3" + +docker run --rm --gpus all --ipc=host --ulimit memlock=-1 \ + -v "${HOST_MEGATRON_LM_DIR}:/workspace/megatron-lm" \ + -v "${HOST_CHECKPOINT_PATH}:/workspace/checkpoints" \ + -v "${HOST_TENSORBOARD_LOGS_PATH}:/workspace/tensorboard_logs" \ + -v "${HOST_TOKENIZER_MODEL_PATH}:/workspace/tokenizer_model" \ + -v "$(dirname "${HOST_DATA_PREFIX}"):/workspace/data_dir" \ + --workdir /workspace/megatron-lm \ + $PYTORCH_IMAGE \ + bash examples/llama/train_llama3_8b_h100_fp8.sh \ + /workspace/checkpoints \ + /workspace/tensorboard_logs \ + /workspace/tokenizer_model \ + "/workspace/data_dir/$(basename "${HOST_DATA_PREFIX}")" \ + 2>&1 | tee "${HOST_TENSORBOARD_LOGS_PATH}/training_custom_$(date +'%y-%m-%d_%H-%M-%S').log" +``` + +## 4. Configuration + + +Default parallelism strategy: +- Tensor Parallel: 1 +- Pipeline Parallel: 1 +- Context Parallel: 2 + +Llama-3-8B architecture: +- 32 layers +- Hidden size: 4096 +- FFN hidden size: 14336 +- Attention heads: 32 +- Query groups: 8 +- Sequence length: 8192 +- RMSNorm normalization with SwiGLU and RoPE + +Key training parameters: +- Micro-batch size: 1 +- Global batch size: 128 +- Learning rate: 1.5e-4 +- Min learning rate: 1.0e-5 +- Weight decay: 0.1 +- FP8 format: hybrid + +You can modify these parameters directly in the `train_llama3_8b_h100_fp8.sh` script. + +This configuration follows those defined in NeMo Framework's performance scripts, which can be found at [https://github.com/NVIDIA/NeMo/tree/main/scripts/performance](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance). 
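+
+As a quick sanity check of the batch-size arithmetic used in the performance
+table below, the gradient-accumulation (GA) column can be derived from the
+other columns. A minimal sketch in plain Python (illustrative only, not part
+of the training scripts):
+
+```python
+# DP = #GPUs / (TP * PP * CP); GA = GBS / (MBS * DP).
+def ga_steps(num_gpus, tp, pp, cp, gbs, mbs):
+    dp = num_gpus // (tp * pp * cp)  # data-parallel replicas
+    return gbs // (mbs * dp)         # micro-batches accumulated per optimizer step
+
+assert ga_steps(8, 1, 1, 2, 128, 1) == 32     # LLAMA3-8B row
+assert ga_steps(64, 4, 8, 1, 128, 1) == 64    # LLAMA3-70B row
+assert ga_steps(1024, 8, 8, 2, 512, 1) == 64  # LLAMA3-405B row
+```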
+
+### FP8 Performance
+
+| Model | #-GPUs | GBS | MBS | Seq Length | TP | PP | CP | VP | EP | GA | Tokens/sec/GPU | TFLOP/sec/GPU |
+|-------|--------|-----|-----|------------|----|----|----|----|----|----|----------------|---------------|
+| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 1 | 32 | 13812 | 800 |
+| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 8 | 1 | 5 | 1 | 64 | 1621 | 780 |
+| LLAMA3-405B | 1024 | 512 | 1 | 8192 | 8 | 8 | 2 | 8 | 1 | 64 | 315 | 834 |
+
+Legend:
+- GBS: Global Batch Size
+- MBS: Micro Batch Size
+- TP: Tensor Parallel size
+- PP: Pipeline Parallel size
+- CP: Context Parallel size
+- VP: Virtual Pipeline stages
+- EP: Expert Parallel size
+- GA: Gradient Accumulation steps
+
+As NeMo uses Megatron-Core, for the latest performance benchmarks, please refer to the official [NeMo documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-summary.html).
+
+## 5. Test Datasets
+
+
+Recommended datasets:
+1. **WikiText-103**: https://huggingface.co/datasets/Salesforce/wikitext
+
+Preprocess datasets:
+```bash
+python "${HOST_MEGATRON_LM_DIR}/tools/preprocess_data.py" \
+    --input your_dataset.json \
+    --output-prefix test_dataset \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model /path/to/tokenizer.model \
+    --append-eod
+```
+
+## 6. FP8 Debugging
+
+
+- **Hardware**: Requires NVIDIA Hopper, Ada, or Blackwell GPUs for FP8 support
+
+- **Troubleshooting**: If you encounter NaN values or instability with FP8 training, please refer to [Transformer Engine](https://github.com/NVIDIA/TransformerEngine).
diff --git a/examples/llama/train_llama3_8b_h100_fp8.sh b/examples/llama/train_llama3_8b_h100_fp8.sh
new file mode 100644
index 00000000000..28227546bc7
--- /dev/null
+++ b/examples/llama/train_llama3_8b_h100_fp8.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+
+# Environment variables for performance tuning
+export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}
+#export LOG_LEVEL=${LOG_LEVEL:-INFO}
+#export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-19}
+#export NVTE_FWD_LAYERNORM_SM_MARGIN=${NVTE_FWD_LAYERNORM_SM_MARGIN:-16}
+#export NVTE_BWD_LAYERNORM_SM_MARGIN=${NVTE_BWD_LAYERNORM_SM_MARGIN:-16}
+#export NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE:-2097152}
+#export NCCL_AVOID_RECORD_STREAMS=${NCCL_AVOID_RECORD_STREAMS:-1}
+
+CHECKPOINT_PATH=${1:-"checkpoints/llama3_8b_fp8"}
+TENSORBOARD_LOGS_PATH=${2:-"tensorboard_logs/llama3_8b_fp8"}
+TOKENIZER_ARG=${3:-"MOCK"} # Path to tokenizer model, or "MOCK"
+DATA_ARG=${4:-"MOCK"}      # Data prefix, or "MOCK"
+
+# Create directories if they don't exist
+mkdir -p "$(dirname "$CHECKPOINT_PATH")"
+mkdir -p "$(dirname "$TENSORBOARD_LOGS_PATH")"
+
+# Distributed training setup
+GPUS_PER_NODE=8
+NUM_NODES=1
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+MASTER_PORT=${MASTER_PORT:-6000}
+NODE_RANK=${NODE_RANK:-0}
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+# Path to the pretrain_gpt.py script, assuming this script is run from the root of the Megatron-LM repository
+PRETRAIN_SCRIPT_PATH="pretrain_gpt.py"
+
+# Fixed model and training parameters
+TP_SIZE=1
+CP_SIZE=1
+PP_SIZE=1
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=128
+NUM_LAYERS=32
+DTYPE="fp8"
+SEQ_LENGTH=8192
+MAX_POSITION_EMBEDDINGS=8192
+
+# Data cache path (useful for both mock and real data)
+DATA_CACHE_PATH="${PWD}/benchmark_cache_llama3_8b_fp8"
+mkdir -p "$DATA_CACHE_PATH"
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE
+    --nnodes $NUM_NODES
+    --node_rank $NODE_RANK
+    --master_addr $MASTER_ADDR
+    --master_port $MASTER_PORT
+)
+
+MODEL_ARGS=(
+    --use-mcore-models
+    --num-layers $NUM_LAYERS
+    --hidden-size 4096
+    --ffn-hidden-size 14336
+    --num-attention-heads 32
+    --group-query-attention
+    --num-query-groups 8
+    --kv-channels 128
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $MAX_POSITION_EMBEDDINGS
+    --position-embedding-type rope
+    --rotary-base 1000000
+    --rotary-percent 1.0
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --swiglu
+    --normalization RMSNorm
+    --init-method-std 0.0134
+    --attention-backend fused
+    --apply-layernorm-1p
+    --untie-embeddings-and-output-weights
+    --disable-bias-linear
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
+    --train-samples 1953125000
+    --lr-decay-samples 1949218748
+    --lr-warmup-samples 3906252
+    --lr 0.00015
+    --min-lr 0.00001
+    --decoupled-lr 5.0e-4 # Specific to decoupled AdamW; ensure the optimizer is compatible
+    --decoupled-min-lr 4.5e-5 # Specific to decoupled AdamW
+    --lr-decay-style cosine
+    --clip-grad 1.0
+    --weight-decay 0.1
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --bf16
+    --grad-reduce-in-bf16
+    --cross-entropy-loss-fusion
+    --calculate-per-token-loss
+    --manual-gc
+    --empty-unused-memory-level 1
+    --exit-duration-in-mins 235
+)
+
+# Conditional arguments based on DTYPE (FP8)
+DTYPE_ARGS=()
+if [[ "$DTYPE" == "fp8" ]]; then
+    DTYPE_ARGS+=(
+        "--fp8-format hybrid"
+        "--fp8-amax-history-len 1024"
+        "--fp8-amax-compute-algo max"
+        "--fp8-param-gather"
+    )
+fi
+
+# Model parallelism arguments
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size $TP_SIZE
+    --context-parallel-size $CP_SIZE
+    # --pipeline-model-parallel-size $PP_SIZE # Not explicitly set; defaults to 1 unless using multi-node PP
+    --sequence-parallel # A no-op at TP_SIZE=1, but becomes useful when TP_SIZE > 1
+)
+
+# Distributed Data Parallel (DDP) arguments
+# From original script's ddp_args
+DDP_ARGS=(
+    --use-distributed-optimizer
+    --overlap-grad-reduce
+    --overlap-param-gather
+)
+TRAINING_ARGS+=("${DDP_ARGS[@]}")
+
+
+# Data arguments (conditional for mock vs real data)
+# Note: elements are word-split when the array is expanded unquoted below, so
+# values must not carry inner quotes.
+DATA_ARGS_LIST=()
+if [[ "$TOKENIZER_ARG" == "MOCK" ]] || [[ "$DATA_ARG" == "MOCK" ]] || [[ -z "$TOKENIZER_ARG" ]]; then
+    DATA_ARGS_LIST+=(
+        "--mock-data"
+        "--tokenizer-type NullTokenizer"
+        "--vocab-size 128256"
+        "--data-cache-path ${DATA_CACHE_PATH}"
+        "--tiktoken-pattern v2"
+        "--split 99,1,0"
+        "--no-create-attention-mask-in-dataloader"
+        "--no-mmap-bin-files"
+        "--num-workers 1"
+    )
+else
+    # Settings for real data
+    DATA_ARGS_LIST+=(
+        "--data-path $DATA_ARG"
+        "--tokenizer-type HuggingFaceTokenizer"
+        "--tokenizer-model $TOKENIZER_ARG"
+        "--data-cache-path ${DATA_CACHE_PATH}"
+        "--split 99,1,0"
+        "--no-create-attention-mask-in-dataloader"
+        "--no-mmap-bin-files"
+        "--num-workers 1"
+        # Note: --vocab-size might be inferred by HuggingFaceTokenizer or might need to be explicit.
+        "--vocab-size 128256"
+    )
+fi
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 1
+    --eval-iters 32
+    --eval-interval 100
+    --save-interval 1000
+    --log-throughput
+    --profile
+    --profile-step-start 4
+    --profile-step-end 6
+    --ckpt-format torch_dist
+    --distributed-timeout-minutes 60
+    --save "$CHECKPOINT_PATH"
+    --load "$CHECKPOINT_PATH"
+    --tensorboard-dir "$TENSORBOARD_LOGS_PATH"
+)
+
+# Ensure pretrain_gpt.py is found
+if [ ! -f "$PRETRAIN_SCRIPT_PATH" ]; then
+    echo "Error: pretrain_gpt.py not found at $PRETRAIN_SCRIPT_PATH"
+    echo "Please ensure you are running this script from the root of the Megatron-LM repository, and pretrain_gpt.py is present."
+    exit 1
+fi
+
+# Run the training command
+torchrun ${DISTRIBUTED_ARGS[@]} \
+    "$PRETRAIN_SCRIPT_PATH" \
+    ${MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${DTYPE_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS_LIST[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
+
+set +x
\ No newline at end of file
diff --git a/examples/mamba/.gitignore b/examples/mamba/.gitignore
new file mode 100644
index 00000000000..940f4797e4b
--- /dev/null
+++ b/examples/mamba/.gitignore
@@ -0,0 +1,4 @@
+checkpoints/
+data-cache/
+tensorboard/
+triton-cache/
diff --git a/examples/mamba/Dockerfile b/examples/mamba/Dockerfile
new file mode 100644
index 00000000000..2e194095b75
--- /dev/null
+++ b/examples/mamba/Dockerfile
@@ -0,0 +1,32 @@
+FROM nvcr.io/nvidia/pytorch:24.01-py3
+
+RUN pip uninstall -y triton && \
+    pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
+
+# The causal-conv1d and mamba-ssm packages below are built from scratch
+# (which takes significant time) because PyPI has no wheels for these package
+# versions that are compatible with the older NGC-variant PyTorch version
+# (e.g. 2.2.0.dev231106) in the NGC base container. In general, a package that
+# is incompatible with the installed PyTorch version fails with a Python import
+# error. The package authors tend to release wheels only for the regular and
+# NGC-variant PyTorch versions that are current at release time, so using newer
+# versions of these packages with older NGC PyTorch containers requires
+# building them from scratch.
+
+RUN cd /tmp && \
+    git clone https://github.com/Dao-AILab/causal-conv1d.git && \
+    cd causal-conv1d && \
+    git checkout v1.2.2.post1 && \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf causal-conv1d
+
+RUN cd /tmp && \
+    git clone https://github.com/state-spaces/mamba.git && \
+    cd mamba && \
+    git checkout v2.0.3 && \
+    MAMBA_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf mamba
diff --git a/examples/mamba/README.md b/examples/mamba/README.md
new file mode 100644
index 00000000000..ce60f119ea5
--- /dev/null
+++ b/examples/mamba/README.md
@@ -0,0 +1,117 @@
+# Mamba-based Language Models
+
+## Introduction
+
+This document is an entrypoint into the code used for
+[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887).
+
+We are releasing the parameters for some of the models described in that
+technical report via
+[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c).
+The code in the `main` branch is no longer compatible with the `Mamba2-*`
+checkpoints. You can load them using the
+[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
+
+## Installation
+
+Create and run a Docker container using the [Dockerfile](./Dockerfile).
+
+```
+docker build -t your_image_name:your_tag .
+docker run --gpus all -it --rm \
+  -v /path/to/megatron:/workspace/megatron \
+  -v /path/to/dataset:/workspace/dataset \
+  -v /path/to/checkpoints:/workspace/checkpoints \
+  -w /workspace/megatron/examples/mamba \
+  your_image_name:your_tag
+```
+
+## Train
+
+[`train.sh`](./train.sh) is an example pretraining script, showing how to run on
+a single node. Select between 800M-scale and 8B-scale models by setting the
+`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as
+the one described in the technical report.
+
+## Text Generation
+
+Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text
+generation server using an 8B hybrid checkpoint. This is configured to run the
+8B hybrid model described in the technical report, with tensor model parallel
+set to 1.
+
+The arguments in the script will need to be changed if using a checkpoint with a
+different model parallel configuration or other differences, such as model
+architecture. For example, to run the 8B pure Mamba-2 model, change
+`--hybrid-layer-pattern` to use only `M` symbols (e.g., 56 `M`s for the 8B
+model), or remove it entirely.
+
+Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start
+a text generation server using the 8B reference Transformer checkpoint.
+
+## Checkpoint Formats
+
+For inference, the model must be configured to match the checkpoint file used,
+including the hybrid layer configuration and model parallel configuration.
+
+If you need to convert a hybrid checkpoint file to a different tensor parallel
+or pipeline parallel size, use
+[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py).
+There is an example run command at the end of that file.
+
+Before running that script, you will need to set `PYTHONPATH` to include the
+root directory of your Megatron-LM repository clone.
+
+```
+export PYTHONPATH=/path/to/Megatron-LM:$PYTHONPATH
+```
+
+## Hybrid Options
+
+`--hybrid-layer-pattern PATTERN` specifies the layer type for every layer in
+the model using a string of single-character symbols:
+
+* `M` — Mamba layer
+* `*` — Attention layer
+* `-` — MLP layer
+* `E` — MoE layer
+
+The number of layers is derived from the pattern length, so `--num-layers`
+should not be specified when `--hybrid-layer-pattern` is used.
+
+For example, the 8B hybrid model described in the technical report uses:
+
+```
+--hybrid-layer-pattern "M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-"
+```
+
+This is a 56-layer model with 4 attention layers, 28 MLP layers, and 24 Mamba
+layers (the sketch at the end of this document shows how to tally these counts).
+
+A pure Mamba model uses only `M` symbols (e.g., `MMMMMMMM` for 8 layers).
+A pure transformer model uses only `*` and `-` symbols.
+
+### Pipeline parallelism
+
+Use `|` to define pipeline stage boundaries for flexible virtual pipeline
+parallelism (fVPP). For example, `M-M-|M-M*-|M-M-|M-M*-` defines 4 pipeline
+segments. The number of segments must be evenly divisible by
+`--pipeline-model-parallel-size`.
+
+### Multi-Token Prediction (MTP)
+
+Use `/` to append MTP layer patterns. Each pattern after the separator
+represents one MTP prediction depth. For example, `M*M*/MM/MM` has main
+pattern `M*M*` with MTP pattern `MM` repeated for 2 depths.
+
+### Deprecated options
+
+`--hybrid-override-pattern`, `--hybrid-attention-ratio`, and
+`--hybrid-mlp-ratio` are deprecated. Use `--hybrid-layer-pattern` instead.
+
+## Mamba vs Mamba-2
+
+This codebase currently only supports Mamba-2, and not the original version of
+Mamba. However, the
+[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba)
+can be configured to run the original version of Mamba.
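+
+As an illustration of the pattern syntax described above, here is a minimal,
+self-contained sketch (the `count_layer_types` helper is ours for illustration,
+not part of the repository) that tallies the layer types in the 8B hybrid
+pattern:
+
+```python
+from collections import Counter
+
+def count_layer_types(pattern: str) -> Counter:
+    # Drop '|' pipeline separators and any '/'-appended MTP patterns,
+    # then count the per-layer symbols (M, *, -, E).
+    main_pattern = pattern.split("/")[0].replace("|", "")
+    return Counter(main_pattern)
+
+pattern = "M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-"
+counts = count_layer_types(pattern)
+assert sum(counts.values()) == 56  # total layers
+assert counts["M"] == 24           # Mamba layers
+assert counts["*"] == 4            # attention layers
+assert counts["-"] == 28           # MLP layers
+```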
diff --git a/examples/mamba/run_text_gen_server_8b.sh b/examples/mamba/run_text_gen_server_8b.sh
new file mode 100755
index 00000000000..f183dea4ad1
--- /dev/null
+++ b/examples/mamba/run_text_gen_server_8b.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
+# To launch the client: python ../../tools/text_generation_cli.py <server-url>
+
+CHECKPOINT_PATH=$1
+TOKENIZER_PATH=$2
+
+HYBRID_LAYER_PATTERN="M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-"
+
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_IB_TIMEOUT=19
+export NCCL_IB_QPS_PER_CONNECTION=4
+
+export TRITON_CACHE_DIR="./triton-cache/"
+export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
+
+torchrun $DISTRIBUTED_ARGS ../../tools/run_hybrid_text_generation_server.py \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --untie-embeddings-and-output-weights \
+    --hybrid-layer-pattern ${HYBRID_LAYER_PATTERN} \
+    --hidden-size 4096 \
+    --load ${CHECKPOINT_PATH} \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --disable-bias-linear \
+    --normalization RMSNorm \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --position-embedding-type none \
+    --tokenizer-type GPTSentencePieceTokenizer \
+    --tokenizer-model ${TOKENIZER_PATH} \
+    --distributed-backend nccl \
+    --distributed-timeout-minutes 1440 \
+    --bf16 \
+    --micro-batch-size 1 \
+    --use-mcore-models \
+    --spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
+    --seed 42
diff --git a/examples/mamba/run_text_gen_server_8b_gpt3.sh b/examples/mamba/run_text_gen_server_8b_gpt3.sh
new file mode 100644
index 00000000000..5413b245ed3
--- /dev/null
+++ b/examples/mamba/run_text_gen_server_8b_gpt3.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Use: ./run_text_gen_server_8b_gpt3.sh <checkpoint-path> <tokenizer-path>
+# To launch the client: python ../../tools/text_generation_cli.py <server-url>
+
+CHECKPOINT_PATH=$1
+TOKENIZER_PATH=$2
+
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_IB_TIMEOUT=19
+export NCCL_IB_QPS_PER_CONNECTION=4
+
+torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --use-flash-attn \
+    --apply-layernorm-1p \
+    --untie-embeddings-and-output-weights \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --load ${CHECKPOINT_PATH} \
+    --num-attention-heads 32 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --disable-bias-linear \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --position-embedding-type rope \
+    --rotary-percent 0.5 \
+    --squared-relu \
+    --tokenizer-type GPTSentencePieceTokenizer \
+    --tokenizer-model ${TOKENIZER_PATH} \
+    --distributed-backend nccl \
+    --distributed-timeout-minutes 1440 \
+    --bf16 \
+    --micro-batch-size 1 \
+    --use-mcore-models \
+    --transformer-impl local \
+    --seed 42
diff --git a/examples/mamba/train.sh b/examples/mamba/train.sh
new file mode 100755
index 00000000000..f971242ff0b
--- /dev/null
+++ b/examples/mamba/train.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+# Use: ./train.sh <data-path> <tokenizer-path>
+
+MODEL_SCALE="800M" # or "8B"
+
+case "${MODEL_SCALE}" in
+    "800M")
+        TENSOR_MODEL_PARALLEL_SIZE=1
HYBRID_LAYER_PATTERN="M-M-M--M-*M-M-M-M--*M-M-M-M-*M--M-M-M-*M-M--M-M-" + HIDDEN_SIZE=1024 + NUM_ATTENTION_HEADS=16 + GLOBAL_BATCH_SIZE=32 + ;; + "8B") + TENSOR_MODEL_PARALLEL_SIZE=4 + HYBRID_LAYER_PATTERN="M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-" + HIDDEN_SIZE=4096 + NUM_ATTENTION_HEADS=32 + GLOBAL_BATCH_SIZE=8 + ;; + *) + echo "Invalid version specified" + exit 1 + ;; +esac + +DATA_PATH=$1 +TOKENIZER_PATH=$2 + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_QPS_PER_CONNECTION=4 + +CHECKPOINT_DIR="./checkpoints" +DATACACHE_DIR="./data-cache" +TENSORBOARD_DIR="./tensorboard" + +mkdir -p ${CHECKPOINT_DIR} +mkdir -p ${DATACACHE_DIR} +mkdir -p ${TENSORBOARD_DIR} + +export TRITON_CACHE_DIR="./triton-cache/" +export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager" + +SEQ_LEN=4096 +TRAIN_SAMPLES=73242188 # 300B tokens / 4096 +LR_WARMUP_SAMPLES=50000 +LR_DECAY_SAMPLES=73192188 # TRAIN_SAMPLES - LR_WARMUP_SAMPLES + +options=" \ + --tensor-model-parallel-size ${TENSOR_MODEL_PARALLEL_SIZE} \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --use-distributed-optimizer \ + --overlap-param-gather \ + --overlap-grad-reduce \ + --untie-embeddings-and-output-weights \ + --init-method-std 0.02 \ + --position-embedding-type none \ + --hybrid-layer-pattern ${HYBRID_LAYER_PATTERN} \ + --hidden-size ${HIDDEN_SIZE} \ + --num-attention-heads ${NUM_ATTENTION_HEADS} \ + --group-query-attention \ + --num-query-groups 8 \ + --seq-length ${SEQ_LEN} \ + --max-position-embeddings ${SEQ_LEN} \ + --train-samples ${TRAIN_SAMPLES} \ + --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ + --lr-decay-samples ${LR_DECAY_SAMPLES} \ + --save ${CHECKPOINT_DIR} \ + --load ${CHECKPOINT_DIR} \ + --data-path ${DATA_PATH} \ + --data-cache-path ${DATACACHE_DIR} \ + --split 99,1,0 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_PATH} \ + --distributed-backend nccl \ + --micro-batch-size 4 \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --lr 2.5e-4 \ + --min-lr 2.5e-5 \ + --lr-decay-style cosine \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --disable-bias-linear \ + --normalization RMSNorm \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 10 \ + --save-interval 2000 \ + --eval-interval 2000 \ + --eval-iters 32 \ + --bf16 \ + --use-mcore-models \ + --spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \ + --no-create-attention-mask-in-dataloader \ + --tensorboard-dir ${TENSORBOARD_DIR}" + +torchrun --nproc_per_node 8 ../../pretrain_hybrid.py ${options} diff --git a/examples/megatron_fsdp/README.md b/examples/megatron_fsdp/README.md new file mode 100644 index 00000000000..eaf5eca1364 --- /dev/null +++ b/examples/megatron_fsdp/README.md @@ -0,0 +1,157 @@ +# Megatron-FSDP Examples + +Example scripts for training and checkpoint conversion using [Megatron-FSDP](../../docs/user-guide/features/megatron_fsdp.md). These demonstrate recommended configurations for Llama 3 8B and DeepSeek-V3 671B models, as well as checkpoint format conversion between `torch_dist` (N-D parallel) and `fsdp_dtensor` formats. + +## Scripts + +### `train_llama3_8b_fsdp_h100_fp8.sh` + +Single-node training script for **Llama 3 8B** using Megatron-FSDP with FP8 precision on H100 GPUs. Uses `torchrun` for local distributed training and supports both mock data (for benchmarking) and real datasets. 
+ +#### Usage + +Run from the root of the Megatron-LM repository: + +```bash +# With mock data (default, for benchmarking) +bash examples/megatron_fsdp/train_llama3_8b_fsdp_h100_fp8.sh + +# With real data +bash examples/megatron_fsdp/train_llama3_8b_fsdp_h100_fp8.sh \ + checkpoints/llama3_8b_fsdp_fp8 \ + tensorboard_logs/llama3_8b_fsdp_fp8 \ + /path/to/tokenizer \ + /path/to/data_prefix +``` + +| Positional Argument | Default | Description | +|---------------------|---------|-------------| +| `$1` — Checkpoint path | `checkpoints/llama3_8b_fsdp_fp8` | Directory for saving and loading checkpoints. | +| `$2` — TensorBoard path | `tensorboard_logs/llama3_8b_fsdp_fp8` | Directory for TensorBoard logs. | +| `$3` — Tokenizer | `MOCK` | Path to a tokenizer model, or `MOCK` for `NullTokenizer`. | +| `$4` — Data path | `MOCK` | Data prefix for training data, or `MOCK` for mock data. | + +#### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `USE_MEGATRON_FSDP` | `1` | Set to `1` to enable Megatron-FSDP. Set to `0` to train with standard DDP. | +| `SHARDING_STRATEGY` | `optim_grads_params` | FSDP sharding strategy (ZeRO-3). Options: `no_shard`, `optim`, `optim_grads`, `optim_grads_params`. | +| `OUTER_SHARDING_STRATEGY` | `no_shard` | DP-Outer sharding strategy for HSDP/HFSDP. Options: `no_shard`, `optim`. | +| `MASTER_ADDR` | `localhost` | Master node address for distributed training. | +| `MASTER_PORT` | `6000` | Master node port. | +| `NODE_RANK` | `0` | Rank of the current node. | + +#### Configuration Summary + +- **Model**: Llama 3 8B (GQA with 32 heads / 8 KV groups, RoPE, SwiGLU, RMSNorm) +- **Parallelism**: TP=1, CP=1, PP=1, 8 GPUs per node, FSDP ZeRO-3 +- **Precision**: FP8 (hybrid format) with BF16 training and BF16 gradient reduction +- **Batch size**: micro-batch=1, global-batch=128, sequence length=8192 +- **Optimizations**: NCCL user buffers, FSDP double buffering, manual registration, meta-device initialization, per-token loss, overlapped grad-reduce and param-gather + +--- + +### `sbatch_mfsdp_deepseek_v3.sh` + +Multi-node SLURM training script for **DeepSeek-V3** (671B MoE) using Megatron-FSDP. Submits an `sbatch` job with containerized execution via `srun`. + +#### Usage + +Set the required configuration variables and submit: + +```bash +export MEGATRON_PATH=/path/to/Megatron-LM +export CONTAINER_IMAGE=/path/to/container.sqsh # or docker image URL +export OUTPUT_PATH=/path/to/output +export DATA_PATH=/path/to/training/data + +bash examples/megatron_fsdp/sbatch_mfsdp_deepseek_v3.sh +``` + +Before running, update the `#SBATCH` directives and `--container-mounts` in the script to match your cluster configuration. + +#### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEGATRON_PATH` | *(required)* | Path to the Megatron-LM repository. | +| `CONTAINER_IMAGE` | *(required)* | Container image (`.sqsh` file or Docker URL). | +| `OUTPUT_PATH` | *(required)* | Base directory for checkpoints, TensorBoard logs, SLURM logs, and Nsight profiles. | +| `DATA_PATH` | *(required)* | Training data prefix path. | +| `USE_MEGATRON_FSDP` | `1` | Enable Megatron-FSDP. Set to `0` for standard DDP. | +| `SHARDING_STRATEGY` | `optim_grads_params` | FSDP sharding strategy (ZeRO-3). | +| `TP` | `1` | Tensor parallel size. | +| `EP` | `8` | Expert parallel size. | +| `MBS` | `4` | Micro-batch size. | +| `GBS` | `2048` | Global batch size. 
| +| `PROFILE` | `0` | Set to `1` to enable Nsight Systems profiling (steps 10–12). | +| `WANDB` | `1` | Set to `1` to enable Weights & Biases logging. Requires `WANDB_API_KEY`. | +| `COMMENT` | N/A | Tag appended to W&B experiment names and Nsight profile filenames. | + +#### Configuration Summary + +- **Model**: DeepSeek-V3 (61 layers, 256 routed experts, top-8 routing, Multi-Latent Attention, MTP) +- **Parallelism**: TP=1, EP=8, CP=1, FSDP ZeRO-3 +- **Precision**: BF16 +- **MoE**: Flex dispatcher with HybridEP backend, grouped GEMM, sigmoid routing with expert bias, auxiliary sequence loss +- **Recomputation**: Selective recomputation of `mlp`, `moe`, `mla_up_proj`, and `layernorm` modules +- **Optimizations**: NCCL user buffers, FSDP double buffering, meta-device initialization, per-token loss, overlapped grad-reduce and param-gather +- **Tokenizer**: `deepseek-ai/DeepSeek-V3` via HuggingFace + +--- + +### `sbatch_checkpoint_convert.sh` + +SLURM batch script for converting checkpoints from **`torch_dist`** (N-D parallel) format to **`fsdp_dtensor`** (Megatron-FSDP) format. This enables resuming training under Megatron-FSDP from checkpoints originally saved with tensor/pipeline/expert parallelism. + +#### Prerequisites + +Before converting, you need a `param_to_param_group_map.json` file. Generate it by running a `torch_dist` training job with the `--dump-param-to-param-group-map` flag, then converting the output: + +```bash +# 1. Run a training job (or trivial experiment) with the dump flag +--dump-param-to-param-group-map /path/to/param_to_param_group_map + +# 2. Convert the dumped map to JSON +python tools/checkpoint/checkpoint_inspector.py \ + print-torch-dcp-in-json /path/to/param_to_param_group_map +``` + +See the [Checkpoint Conversion](../../docs/user-guide/features/megatron_fsdp.md#checkpoint-conversion) section in the Megatron-FSDP docs for details. + +#### Usage + +Set the required configuration variables, update the checkpoint paths in `RUN_CMD`, and submit: + +```bash +export MEGATRON_PATH=/path/to/Megatron-LM +export CONTAINER_IMAGE=/path/to/container.sqsh +export OUTPUT_PATH=/path/to/output + +bash examples/megatron_fsdp/sbatch_checkpoint_convert.sh +``` + +Before running, you must edit the script to fill in: +- The input `torch_dist` checkpoint path +- The output `fsdp_dtensor` checkpoint path +- The path to `param_to_param_group_map.json` +- The `#SBATCH` directives and `--container-mounts` for your cluster + +#### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEGATRON_PATH` | *(required)* | Path to the Megatron-LM repository. | +| `CONTAINER_IMAGE` | *(required)* | Container image (`.sqsh` file or Docker URL). | +| `OUTPUT_PATH` | *(required)* | Base directory for SLURM logs. | + +#### Conversion Command + +The script runs `checkpoint_inspector.py convert-torch-dist-to-fsdp-dtensor` with the `--swiglu` flag (for models using SwiGLU activations). Remove `--swiglu` if converting a non-SwiGLU model. + +## Further Reading + +- [Megatron-FSDP User Guide](../../docs/user-guide/features/megatron_fsdp.md) — full feature guide, API reference, and sharding strategy documentation. +- [Megatron-FSDP on PyPI](https://pypi.org/project/megatron-fsdp/) — standalone `fully_shard` API. +- [Megatron-FSDP Source](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/distributed/fsdp/src) — implementation source code. 
diff --git a/examples/megatron_fsdp/sbatch_checkpoint_convert.sh b/examples/megatron_fsdp/sbatch_checkpoint_convert.sh new file mode 100644 index 00000000000..9f302c93f8f --- /dev/null +++ b/examples/megatron_fsdp/sbatch_checkpoint_convert.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Configuration: Set these paths before running the script +MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository +CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url +OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for SLURM logs + +# Checkpoint conversion command +# Note: Update the checkpoint paths in the command below +RUN_CMD=" +cd ${MEGATRON_PATH}; +git rev-parse HEAD; +export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH}; +python3 tools/checkpoint/checkpoint_inspector.py \ + convert-torch-dist-to-fsdp-dtensor --swiglu \ + your_own_path_to_input_torch_dist_checkpoint \ + your_own_path_to_output_fsdp_dtensor_checkpoint \ + --param-to-param-group-map-json your_own_path_to_param_to_param_group_map.json" + +# SLURM settings +SLURM_LOGS="${OUTPUT_PATH}/slurm_logs" +mkdir -p ${SLURM_LOGS} || { + echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}" + exit 1 +} + +# Submit SLURM job +# Note: Update SBATCH parameters below according to your cluster configuration +set +e +sbatch <<EOF +#!/bin/bash +# NOTE: Add your cluster's #SBATCH directives here (account, partition, nodes, time, ...) +srun --container-image ${CONTAINER_IMAGE} --container-mounts your_own_container_mounts bash -c "${RUN_CMD}" 2>&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log + +EOF +set -e diff --git a/examples/megatron_fsdp/sbatch_mfsdp_deepseek_v3.sh b/examples/megatron_fsdp/sbatch_mfsdp_deepseek_v3.sh new file mode 100644 index 00000000000..22a8f22f68c --- /dev/null +++ b/examples/megatron_fsdp/sbatch_mfsdp_deepseek_v3.sh @@ -0,0 +1,223 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export PYTHONWARNINGS=ignore +export TRITON_CACHE_DIR=/tmp/triton_cache_$SLURM_NODEID + +# Configuration: Set these variables before running the script +MEGATRON_PATH=${MEGATRON_PATH:-"your_own_megatron_path"} # Path to Megatron-LM repository +CONTAINER_IMAGE=${CONTAINER_IMAGE:-"your_own_container_image"} # Path to .sqsh or docker image url +OUTPUT_PATH=${OUTPUT_PATH:-"your_own_output_path"} # Path for output logs and checkpoints +DATA_PATH=${DATA_PATH:-"your_own_data_path"} +USE_MEGATRON_FSDP=${USE_MEGATRON_FSDP:-1} +SHARDING_STRATEGY=${SHARDING_STRATEGY:-"optim_grads_params"} +PROFILE=${PROFILE:-0} +WANDB=${WANDB:-1} + +TP=${TP:-1} +EP=${EP:-8} +MBS=${MBS:-4} +GBS=${GBS:-2048} +COMMENT=${COMMENT:-""} + +PRETRAIN_ARGS=( + --distributed-timeout-minutes 60 + --tensor-model-parallel-size ${TP} + --expert-model-parallel-size ${EP} + --expert-tensor-parallel-size 1 + --context-parallel-size 1 + --use-distributed-optimizer + --overlap-grad-reduce + --overlap-param-gather + --use-mcore-models + --sequence-parallel + --use-flash-attn + --disable-bias-linear + --micro-batch-size ${MBS} + --global-batch-size ${GBS} + --train-samples 585937500 + --exit-duration-in-mins 220 + --no-check-for-nan-in-loss-and-grad + --manual-gc + --manual-gc-interval 10 + --recompute-granularity selective + --recompute-modules mlp moe mla_up_proj layernorm + --transformer-impl transformer_engine + --seq-length 4096 + --data-cache-path ${OUTPUT_PATH}/cache + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model deepseek-ai/DeepSeek-V3 + --data-path ${DATA_PATH} + --split 99,1,0 + --no-mmap-bin-files + --no-create-attention-mask-in-dataloader + 
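# Dataloader workers, DeepSeek-V3 architecture, LR schedule, and MoE/MLA arguments follow. + 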
--num-workers 6 + --num-layers 61 + --hidden-size 7168 + --ffn-hidden-size 18432 + --num-attention-heads 128 + --kv-channels 128 + --max-position-embeddings 4096 + --position-embedding-type rope + --rotary-base 10000 + --make-vocab-size-divisible-by 3232 + --normalization RMSNorm + --norm-epsilon 1e-6 + --swiglu + --untie-embeddings-and-output-weights + --multi-latent-attention + --attention-dropout 0.0 + --hidden-dropout 0.0 + --clip-grad 1.0 + --weight-decay 0.1 + --qk-layernorm + --lr-decay-samples 584765624 + --lr-warmup-samples 1536000 + --lr-warmup-init 3.9e-7 + --lr 3.9e-6 + --min-lr 3.9e-7 + --lr-decay-style cosine + --adam-beta1 0.9 + --adam-beta2 0.95 + --num-experts 256 + --moe-layer-freq [0]*3+[1]*58 + --moe-ffn-hidden-size 2048 + --moe-shared-expert-intermediate-size 2048 + --moe-router-load-balancing-type seq_aux_loss + --moe-router-topk 8 + --moe-token-dispatcher-type flex + --moe-flex-dispatcher-backend hybridep + --moe-router-pre-softmax + --moe-grouped-gemm + --moe-aux-loss-coeff 1e-4 + --moe-router-group-topk 4 + --moe-router-num-groups 8 + --moe-router-topk-scaling-factor 2.5 + --moe-router-score-function sigmoid + --moe-router-enable-expert-bias + --moe-router-bias-update-rate 1e-3 + --moe-router-dtype fp32 + --moe-permute-fusion + --moe-router-force-load-balancing + --q-lora-rank 1536 + --kv-lora-rank 512 + --qk-head-dim 128 + --qk-pos-emb-head-dim 64 + --v-head-dim 128 + --rotary-scaling-factor 40 + --mscale 1.0 + --mscale-all-dim 1.0 + --mtp-num-layers 1 + --mtp-loss-scaling-factor 0.1 + --eval-iters 32 + --eval-interval 100 + --auto-detect-ckpt-format + --load ${OUTPUT_PATH}/checkpoints + --save ${OUTPUT_PATH}/checkpoints + --save-interval 100 + --dist-ckpt-strictness log_all + --init-method-std 0.02 + --log-timers-to-tensorboard + --log-memory-to-tensorboard + --log-num-zeros-in-grad + --log-params-norm + --log-validation-ppl-to-tensorboard + --log-throughput + --log-interval 1 + --logging-level 40 + --tensorboard-dir ${OUTPUT_PATH}/tensorboard + --bf16 + --enable-experimental +) + +if [ "${USE_MEGATRON_FSDP}" = 1 ]; then + unset CUDA_DEVICE_MAX_CONNECTIONS + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --use-megatron-fsdp + --data-parallel-sharding-strategy ${SHARDING_STRATEGY} + --no-gradient-accumulation-fusion + --use-distributed-optimizer + --calculate-per-token-loss + --init-model-with-meta-device + --ckpt-format fsdp_dtensor + --grad-reduce-in-bf16 + --fsdp-double-buffer + --use-nccl-ub + ) +fi + +# Profiling command +if [ "${PROFILE}" = 1 ]; then + PROFILE_CMD="nsys profile --sample=none --cpuctxsw=none --trace=cuda,nvtx,cublas,cudnn \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --cuda-graph-trace=node \ + --cuda-memory-usage=true \ + -f true -x true \ + -o ${OUTPUT_PATH}/nsys/Megatron-FSDP-Deepseek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT}" + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --profile + --profile-step-start 10 + --profile-step-end 12 + --profile-ranks 0 + ) + echo "PROFILE_CMD=" + echo $PROFILE_CMD +else + PROFILE_CMD="" +fi + +if [ "${WANDB}" = 1 ]; then + export WANDB_API_KEY=${WANDB_API_KEY:-"your_own_wandb_api_key"} + PRETRAIN_ARGS=( + "${PRETRAIN_ARGS[@]}" + --wandb-project your_own_wandb_project + --wandb-exp-name DeepSeek-V3-TP${TP}EP${EP}-MBS${MBS}GBS${GBS}-${COMMENT} + ) +fi + +TRAINING_CMD=" +cd ${MEGATRON_PATH}; +git rev-parse HEAD; +export PYTHONPATH=${MEGATRON_PATH}:${PYTHONPATH}; +${PROFILE_CMD} python ${MEGATRON_PATH}/pretrain_gpt.py ${PRETRAIN_ARGS[@]}" + +# SLURM settings 
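+# Each job's stdout/stderr is also tee'd to ${OUTPUT_PATH}/slurm_logs/<job id>.log. 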
+SLURM_LOGS="${OUTPUT_PATH}/slurm_logs" +mkdir -p ${SLURM_LOGS} || { + echo "Error: Failed to create SLURM logs directory ${SLURM_LOGS}" + exit 1 +} + +# Submit SLURM job +# Note: Update SBATCH parameters below according to your cluster configuration +set +e +sbatch <<EOF +#!/bin/bash +# NOTE: Add your cluster's #SBATCH directives here (account, partition, node count, time, ...) +srun --container-image ${CONTAINER_IMAGE} --container-mounts your_own_container_mounts bash -c "${TRAINING_CMD}" 2>&1 | tee ${SLURM_LOGS}/\${SLURM_JOB_ID}.log + +EOF +set -e diff --git a/examples/megatron_fsdp/train_llama3_8b_fsdp_h100_fp8.sh b/examples/megatron_fsdp/train_llama3_8b_fsdp_h100_fp8.sh new file mode 100644 index 00000000000..ddd3f160fa7 --- /dev/null +++ b/examples/megatron_fsdp/train_llama3_8b_fsdp_h100_fp8.sh @@ -0,0 +1,212 @@ +#!/bin/bash + +CHECKPOINT_PATH=${1:-"checkpoints/llama3_8b_fsdp_fp8"} +TENSORBOARD_LOGS_PATH=${2:-"tensorboard_logs/llama3_8b_fsdp_fp8"} +TOKENIZER_ARG=${3:-"MOCK"} # Path to tokenizer model, or "MOCK" +DATA_ARG=${4:-"MOCK"} # Data prefix, or "MOCK" + +# Create directories if they don't exist +mkdir -p "$(dirname "$CHECKPOINT_PATH")" +mkdir -p "$(dirname "$TENSORBOARD_LOGS_PATH")" + +# Distributed training setup +GPUS_PER_NODE=8 +NUM_NODES=1 +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-6000} +NODE_RANK=${NODE_RANK:-0} +WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES)) + +# Path to the pretrain_gpt.py script, assuming this script +# is run from the root of the Megatron-LM repository. +PRETRAIN_SCRIPT_PATH="pretrain_gpt.py" + +# Model & Training Parameters +USE_MEGATRON_FSDP=${USE_MEGATRON_FSDP:-1} +SHARDING_STRATEGY=${SHARDING_STRATEGY:-"optim_grads_params"} +OUTER_SHARDING_STRATEGY=${OUTER_SHARDING_STRATEGY:-"no_shard"} +TP_SIZE=1 +CP_SIZE=1 +PP_SIZE=1 +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=128 +NUM_LAYERS=32 +DTYPE="fp8" +SEQ_LENGTH=8192 +MAX_POSITION_EMBEDDINGS=8192 + +# Data cache path (useful for both mock and real data) +DATA_CACHE_PATH="${PWD}/benchmark_cache_llama3_8b_fsdp_fp8" +mkdir -p "$DATA_CACHE_PATH" + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --node_rank $NODE_RANK + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +MODEL_ARGS=( + --use-mcore-models + --num-layers $NUM_LAYERS + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --kv-channels 128 + --seq-length $SEQ_LENGTH + --max-position-embeddings $MAX_POSITION_EMBEDDINGS + --position-embedding-type rope + --rotary-base 1000000 + --rotary-percent 1.0 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --swiglu + --normalization RMSNorm + --init-method-std 0.0134 + --attention-backend fused + --apply-layernorm-1p + --untie-embeddings-and-output-weights + --disable-bias-linear +) + +TRAINING_ARGS=( + --micro-batch-size $MICRO_BATCH_SIZE + --global-batch-size $GLOBAL_BATCH_SIZE + --train-samples 1953125000 + --lr-decay-samples 1949218748 + --lr-warmup-samples 3906252 + --lr 0.00015 + --min-lr 0.00001 + --decoupled-lr 5.0e-4 + --decoupled-min-lr 4.5e-5 + --lr-decay-style cosine + --clip-grad 1.0 + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.95 + --bf16 + --cross-entropy-loss-fusion + --manual-gc + --empty-unused-memory-level 1 + --exit-duration-in-mins 235 +) + +if [ "${USE_MEGATRON_FSDP}" = 1 ]; then + unset CUDA_DEVICE_MAX_CONNECTIONS + TRAINING_ARGS=( + "${TRAINING_ARGS[@]}" + --use-megatron-fsdp + --data-parallel-sharding-strategy ${SHARDING_STRATEGY} + --no-gradient-accumulation-fusion + --calculate-per-token-loss + --init-model-with-meta-device + --ckpt-format fsdp_dtensor + --grad-reduce-in-bf16 + --use-nccl-ub + --fsdp-double-buffer + --fsdp-manual-registration + # To enable HFSDP, i.e. DP 
full-sharding of the optimizer state with + # hierarchical data parallelism (DP-Outer=2, DP-Inner=DP//2), uncomment: + # --num-distributed-optimizer-instances 2 + # --outer-dp-sharding-strategy ${OUTER_SHARDING_STRATEGY} + # To further customize Megatron-FSDP data precision, uncomment: + # --megatron-fsdp-main-params-dtype fp32 + # --megatron-fsdp-main-grads-dtype auto + # --megatron-fsdp-grad-comm-dtype auto + ) +fi + +# Conditional arguments based on DTYPE (FP8) +DTYPE_ARGS=() +if [[ "$DTYPE" == "fp8" ]]; then + DTYPE_ARGS+=( + "--fp8-format hybrid" + "--fp8-amax-history-len 1024" + "--fp8-amax-compute-algo max" + "--fp8-param-gather" + ) +fi + +# Model parallelism arguments +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size $TP_SIZE + --context-parallel-size $CP_SIZE + --sequence-parallel +) + +# Distributed Data Parallel (DDP) arguments +# (distributed optimizer with grad-reduce/param-gather communication overlap) +DDP_ARGS=( + --use-distributed-optimizer + --overlap-grad-reduce + --overlap-param-gather +) +TRAINING_ARGS+=("${DDP_ARGS[@]}") + + +# Data arguments (conditional for mock vs real data) +DATA_ARGS_LIST=() +if [[ "$TOKENIZER_ARG" == "MOCK" ]] || [[ "$DATA_ARG" == "MOCK" ]] || [[ -z "$TOKENIZER_ARG" ]]; then + DATA_ARGS_LIST+=( + "--mock-data" + "--tokenizer-type NullTokenizer" + "--vocab-size 128256" + "--data-cache-path ${DATA_CACHE_PATH}" + "--tiktoken-pattern v2" + "--split 99,1,0" + "--no-create-attention-mask-in-dataloader" + "--no-mmap-bin-files" + "--num-workers 1" + ) +else + # Settings for real data + DATA_ARGS_LIST+=( + "--data-path $DATA_ARG" + "--tokenizer-type HuggingFaceTokenizer" + "--tokenizer-model $TOKENIZER_ARG" + "--data-cache-path ${DATA_CACHE_PATH}" + "--split 99,1,0" + "--no-create-attention-mask-in-dataloader" + "--no-mmap-bin-files" + "--num-workers 1" + # Note: --vocab-size may be inferred by HuggingFaceTokenizer; it is set explicitly here to be safe. + "--vocab-size 128256" + ) +fi + +EVAL_AND_LOGGING_ARGS=( + --log-interval 1 + --eval-iters 32 + --eval-interval 100 + --save-interval 1000 + --log-throughput + --profile + --profile-step-start 4 + --profile-step-end 6 + --distributed-timeout-minutes 60 + --save "$CHECKPOINT_PATH" + --load "$CHECKPOINT_PATH" + --tensorboard-dir "$TENSORBOARD_LOGS_PATH" +) + +# Ensure pretrain_gpt.py is found +if [ ! -f "$PRETRAIN_SCRIPT_PATH" ]; then + echo "Error: pretrain_gpt.py not found at $PRETRAIN_SCRIPT_PATH" + echo "Please ensure you are running this script from the root of the Megatron-LM repository and that pretrain_gpt.py is present." 
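+ # Abort here; torchrun would otherwise fail later with a less obvious error. 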
+ exit 1 +fi + +# Run the training command +torchrun ${DISTRIBUTED_ARGS[@]} \ + "$PRETRAIN_SCRIPT_PATH" \ + ${MODEL_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${DTYPE_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${DATA_ARGS_LIST[@]} \ + ${EVAL_AND_LOGGING_ARGS[@]} + +set +x \ No newline at end of file diff --git a/examples/merge_mp_bert.sh b/examples/merge_mp_bert.sh deleted file mode 100755 index 1383433284b..00000000000 --- a/examples/merge_mp_bert.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -TENSOR_MODEL_PARALLEL_SIZE=2 - -VOCAB_FILE=bert-vocab.txt -CHECKPOINT_PATH=checkpoints/bert_345m - -WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ - --model-type BERT \ - --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ - --tokenizer-type BertWordPieceLowerCase \ - --vocab-file $VOCAB_FILE \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --load $CHECKPOINT_PATH diff --git a/examples/mimo/__init__.py b/examples/mimo/__init__.py new file mode 100644 index 00000000000..0519ecba6ea --- /dev/null +++ b/examples/mimo/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/mimo/avlm_inference.py b/examples/mimo/avlm_inference.py new file mode 100644 index 00000000000..14ca2068cac --- /dev/null +++ b/examples/mimo/avlm_inference.py @@ -0,0 +1,244 @@ +import argparse +import os +from pathlib import Path +from typing import Union + +# hf path +import requests +import torch +from PIL import Image +from transformers import AutoProcessor +from transformers import AutoTokenizer +import soundfile as sf +import io +import numpy as np +import scipy.signal as signal + +from examples.mimo.model_providers.llava_avlm import model_provider_llava_avlm +from megatron.core import dist_checkpointing, parallel_state, tensor_parallel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.training import print_rank_0 +from examples.mimo.data.utils.calculate_audio_tokens import calculate_num_audio_tokens + +def init_distributed(tp_size: int = 1, pp_size: int = 1): + if torch.distributed.is_initialized(): + return + rank = int(os.environ.get("LOCAL_RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + torch.cuda.set_device(rank % torch.cuda.device_count()) + torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + parallel_state.initialize_model_parallel(tp_size, pp_size) + +def get_input_data( + processor: AutoProcessor, + image_processor: AutoProcessor, + audio_processor: AutoProcessor, + audio_path: str, + image_path: str, + prompt: str, + device: Union[int, str] = 0): + """ + Prepare inputs for the MIMO model forward pass. 
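+ + Audio is resampled to 16 kHz and converted to a float tensor; the image is + rescaled to [0, 1] in CxHxW layout; the prompt is wrapped in the processor's + chat template before the modality processors run. 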
+ """ + + def read_audio(audio_path): + """Process audio file and return tensor.""" + with open(audio_path, 'rb') as f: + audio_bytes = f.read() + audio_io = io.BytesIO(audio_bytes) + waveform, sample_rate = sf.read(audio_io) + + # Resample if needed + fixed_sample_rate = 16000 + if sample_rate != fixed_sample_rate: + num_samples = int(len(waveform) * fixed_sample_rate / sample_rate) + waveform = signal.resample(waveform, num_samples) + + # Convert to tensor + audio_tensor = torch.from_numpy(waveform).float() + return audio_tensor + + def read_image(image_path): + """Process image file and return tensor.""" + with open(image_path, 'rb') as f: + image_bytes = f.read() + image_io = io.BytesIO(image_bytes) + image = Image.open(image_io) + image_tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1) # Convert to CxHxW format + image_tensor = image_tensor.float() / 255.0 # rescale to [0,1] range + return image_tensor + + + # read audio and image + audio_tensor = read_audio(audio_path) + image_tensor = read_image(image_path) + + # set up prompt + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + # process audio + processed_audios = audio_processor(audio_tensor, sampling_rate=16000) + processed_audios = torch.tensor(processed_audios["input_features"]) + processed_audios = processed_audios.squeeze(0) # remove batch dim + num_audio_tokens = calculate_num_audio_tokens(audio_tensor.unsqueeze(0), "openai/whisper-base") + audios_seq_lengths = torch.tensor(num_audio_tokens) + prompt = prompt.replace("