
Commit 3fb1255

Update on "[pt2 bug bash] Fix nn.functional.pad compile crash with deterministic mode + replication padding"
Fixes #170079

## Context

`torch.compile(ReplicationPad1d(...), fullgraph=True)` crashes when `torch.use_deterministic_algorithms(True)` is set on CUDA. The error: Dynamo can't trace through `importlib.import_module`. The deterministic code path exists because the native `replication_pad1d_backward` CUDA kernel uses `atomicAdd` (non-deterministic). `functional.py` calls `_replication_pad` — a Python decomposition using `_unsafe_index`, whose backward uses `index_put` (deterministic).

## Dynamo limitations encountered

Three separate Dynamo tracing barriers prevented calling `_replication_pad` directly:

### 1. `importlib.import_module` is marked as skipped

```python
@torch.compile(fullgraph=True)
def fn(x):
    import importlib
    return importlib.import_module("torch").sin(x)

fn(torch.randn(3))  # Unsupported: function marked as skipped
```

### 2. `elementwise_dtypes` returns non-Tensor (from `pw_cast_for_opmath`)

```python
@torch.compile(fullgraph=True)
def fn(x):
    from torch._prims_common import elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND
    dt, _ = elementwise_dtypes(x, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT)
    return x.to(dt)

fn(torch.randn(3))  # Unsupported: torch.* op returned non-Tensor
```

### 3. `torch._check` with closure lambda

```python
@torch.compile(fullgraph=True)
def fn(x):
    dim = x.dim()
    torch._check(dim in (2, 3), lambda: f"expected 2D or 3D, got {dim}D")
    return x + 1

fn(torch.randn(3, 3))  # Unsupported: Can't extract message from torch._check()
```

## Iteration log

| # | Approach | Who | Tests | Reviewer pushback | Why it failed |
|---|----------|-----|-------|-------------------|---------------|
| 1 | Replace `importlib` with `from...import` | Claude | bilinear/trilinear pass, replicate fails | "why do we need bilinear/trilinear tests?" — scoped fix to reported bug only | Hit limitation #2: `pw_cast_for_opmath` |
| 2 | Skip decomposition under compile via `is_compiling()`, rely on AOTAutograd's `register_decomposition` | Claude | forward-only `backend="eager"` passes | "can you verify at inductor level this is actually deterministic?" — inspect AOT graph | No backward decomposition registered; backward still uses native `replication_pad1d_backward` (non-deterministic) |
| 3 | Unwrap `pw_cast_for_opmath` via `__wrapped__` | Claude | N/A — fails immediately | N/A | Hit limitation #3: `torch._check()` closure |
| 4 | `nonstrict_trace` — Dynamo skips the body, AOTAutograd traces through | Reviewer suggestion | `backend="aot_eager"`, forward + backward under `DeterministicGuard(True)` | N/A — fix is correct | N/A |

## Key insight

The fix isn't about making Dynamo trace the decomposition or skipping it entirely — it's about putting the boundary in the right place. Dynamo doesn't need to see inside; AOTAutograd does. `nonstrict_trace` is exactly this boundary.

Each "obvious" fix had passing tests that weren't testing the right thing. Only when the reviewer pushed for backward determinism verification and AOT graph inspection did the weaknesses surface. The backward completing without error under `DeterministicGuard(True)` proves determinism — PyTorch explicitly raises `RuntimeError` if any non-deterministic CUDA kernel executes under this mode.

Authored with Claude.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx kadeng chauhang amjames Lucaskabela jataylo

[ghstack-poisoned]
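The commit message describes the fix and its verification but not the patch itself. Below is a minimal sketch, not the actual diff: the `torch._dynamo.nonstrict_trace` placement on `_replication_pad` is an assumption based on the description above, and the test mirrors what the iteration log says about approach #4 (`backend="aot_eager"`, forward and backward under `DeterministicGuard(True)`).

```python
# Sketch only: assumes the fix marks the Python decomposition with
# torch._dynamo.nonstrict_trace so Dynamo records an opaque call while
# AOTAutograd traces the _unsafe_index body and derives the deterministic
# index_put backward.
import torch
import torch.nn as nn
from torch.testing._internal.common_utils import DeterministicGuard

# Hypothetical shape of the change in torch/nn/functional.py:
#
#     @torch._dynamo.nonstrict_trace
#     def _replication_pad(input, pad): ...
#
# Verification: DeterministicGuard(True) makes PyTorch raise RuntimeError if
# any non-deterministic CUDA kernel (such as the atomicAdd-based
# replication_pad1d_backward) executes, so a clean backward pass is evidence
# the deterministic decomposition was actually used.

def verify_replication_pad_deterministic() -> None:
    pad = torch.compile(nn.ReplicationPad1d(2), backend="aot_eager", fullgraph=True)
    x = torch.randn(1, 3, 8, device="cuda", requires_grad=True)
    with DeterministicGuard(True):
        out = pad(x)
        out.sum().backward()  # must complete without RuntimeError
    print("replication pad forward + backward ran in deterministic mode")

if __name__ == "__main__":
    if torch.cuda.is_available():
        verify_replication_pad_deterministic()
```

The point of the boundary choice is visible here: Dynamo never needs to understand the decomposition's internals, it only needs a single traceable call site, while AOTAutograd still sees the full body and therefore the deterministic backward.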
2 parents ccd55a3 + f7c6a9b commit 3fb1255

487 files changed

Lines changed: 20854 additions & 6502 deletions


.bazelrc

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-build --cxxopt=--std=c++17
+build --cxxopt=--std=c++20
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code

.ci/docker/build.sh

Lines changed: 6 additions & 2 deletions
@@ -195,12 +195,16 @@ case "$tag" in
     NINJA_VERSION=1.9.0
     TRITON=yes
     ;;
-  pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
+  pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-client | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=13
     VISION=yes
     XPU_VERSION=2025.3
-    XPU_DRIVER_TYPE=LTS
+    if [[ $tag =~ "client" ]]; then
+      XPU_DRIVER_TYPE=CLIENT
+    else
+      XPU_DRIVER_TYPE=LTS
+    fi
     NINJA_VERSION=1.9.0
     TRITON=yes
     if [[ $tag =~ "benchmarks" ]]; then

.ci/docker/ci_commit_pins/nccl.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-v2.29.3-1
+v2.29.7-1

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-307748db7742a0f8259a7ea0336909eb55d2051a
+33f782efa9464adebb448ea1f1df1a64ec37ceb0

.ci/docker/common/install_cuda.sh

Lines changed: 4 additions & 4 deletions
@@ -111,7 +111,7 @@ function install_126 {
 }
 
 function install_129 {
-    CUDNN_VERSION=9.17.1.4
+    CUDNN_VERSION=9.20.0.48
     echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
     # install CUDA 12.9.1 in the same container
     install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux
@@ -129,7 +129,7 @@ function install_129 {
 }
 
 function install_128 {
-    CUDNN_VERSION=9.19.0.56
+    CUDNN_VERSION=9.20.0.48
     echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
     # install CUDA 12.8.1 in the same container
     install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
@@ -147,7 +147,7 @@ function install_128 {
 }
 
 function install_130 {
-    CUDNN_VERSION=9.19.0.56
+    CUDNN_VERSION=9.20.0.48
     echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.8.0"
     # install CUDA 13.0 in the same container
     install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
@@ -165,7 +165,7 @@ function install_130 {
 }
 
 function install_132 {
-    CUDNN_VERSION=9.19.0.56
+    CUDNN_VERSION=9.20.0.48
     echo "Installing CUDA 13.2 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.8.0"
     # install CUDA 13.2 in the same container
     install_cuda 13.2.0 cuda_13.2.0_595.45.04_linux

.ci/docker/requirements-docs.txt

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@ sphinx==7.2.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 7.2.6
 
-pytorch_sphinx_theme2==0.4.3
+pytorch_sphinx_theme2==0.4.6
 #Description: This is needed to generate PyTorch docs
-#Pinned versions: 0.4.3
+#Pinned versions: 0.4.6
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably

.ci/lumen_cli/cli/lib/core/torchtitan/torchtitan_test.py

Lines changed: 2 additions & 1 deletion
@@ -21,11 +21,12 @@ def __init__(self, args: Any):
 
     def prepare(self):
         clone_torchtitan(dst=self.work_directory)
-        # torchao nightly is required by torchtitan
+        # torchao and torchcomms nightlies are required by torchtitan
         pip_install_packages(
             packages=[
                 "--pre",
                 "torchao",
+                "torchcomms",
                 "--index-url",
                 "https://download.pytorch.org/whl/nightly/cu129",
             ],

.ci/magma/Makefile

Lines changed: 7 additions & 0 deletions
@@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma/build_magma.sh
 
 .PHONY: all
+all: magma-cuda132
 all: magma-cuda130
 all: magma-cuda129
 all: magma-cuda128
@@ -26,6 +27,12 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output
 
+.PHONY: magma-cuda132
+magma-cuda132: DESIRED_CUDA := 13.2
+magma-cuda132: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
+magma-cuda132:
+	$(DOCKER_RUN)
+
 .PHONY: magma-cuda130
 magma-cuda130: DESIRED_CUDA := 13.0
 magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120

.ci/pytorch/common_utils.sh

Lines changed: 2 additions & 2 deletions
@@ -320,7 +320,7 @@ function install_flash_attn_cute() {
     git checkout "${flash_attn_commit}"
 
     # Install only the 'cute' sub-directory
-    pip_install -e flash_attn/cute/
+    pip_install flash_attn/cute/
     popd
 
     # remove the local repo
@@ -367,7 +367,7 @@ function install_cutlass_api() {
     git checkout "${cutlass_commit}"
 
     # Install cutlass_api with torch extras
-    pip_install -e "python/cutlass_api[torch]"
+    pip_install "python/cutlass_api[torch]"
     popd
 
     rm -rf cutlass-build

.ci/pytorch/smoke_test/smoke_test.py

Lines changed: 86 additions & 0 deletions
@@ -25,6 +25,7 @@
 package_type = os.getenv("MATRIX_PACKAGE_TYPE")
 target_os = os.getenv("TARGET_OS", sys.platform)
 BASE_DIR = Path(__file__).parent.parent.parent
+PYTORCH_ROOT = BASE_DIR.parent
 
 is_cuda_system = gpu_arch_type == "cuda"
 NIGHTLY_ALLOWED_DELTA = 3
@@ -216,6 +217,89 @@ def find_pypi_package_version(package: str) -> str | None:
     return None
 
 
+def get_expected_cudnn_version_linux(cuda_version: str) -> str | None:
+    """Parse expected cuDNN version from generate_binary_build_matrix.py for Linux.
+
+    Reads PYTORCH_EXTRA_INSTALL_REQUIREMENTS and extracts the cudnn version
+    for the given CUDA version (e.g. "12.6").
+    """
+    matrix_script = (
+        PYTORCH_ROOT / ".github" / "scripts" / "generate_binary_build_matrix.py"
+    )
+    if not matrix_script.exists():
+        print(f"Warning: {matrix_script} not found, skipping cuDNN version check")
+        return None
+
+    content = matrix_script.read_text()
+    # Match the full cudnn package version like nvidia-cudnn-cu12==9.10.2.21
+    # and extract major.minor.patch (dropping the build number)
+    pattern = (
+        rf'"{re.escape(cuda_version)}":\s*\(\s*'
+        r"[\s\S]*?nvidia-cudnn-cu\d+==(\d+\.\d+\.\d+)\.\d+"
+    )
+    match = re.search(pattern, content)
+    if match:
+        return match.group(1)
+    return None
+
+
+def get_expected_cudnn_version_windows(cuda_version: str) -> str | None:
+    """Parse expected cuDNN version from cuda_install.bat for Windows.
+
+    Reads the batch file and extracts EXPECTED_CUDNN_VERSION for the given
+    CUDA version (e.g. "12.6" maps to CUDA_VER 126).
+    """
+    bat_file = (
+        PYTORCH_ROOT / ".ci" / "pytorch" / "windows" / "internal" / "cuda_install.bat"
+    )
+    if not bat_file.exists():
+        print(f"Warning: {bat_file} not found, skipping cuDNN version check")
+        return None
+
+    content = bat_file.read_text()
+    # Convert "12.6" to "126" to match batch file's CUDA_VER format
+    cuda_ver_nodot = cuda_version.replace(".", "")
+    # Match: if %CUDA_VER% EQU 126 ( ... set EXPECTED_CUDNN_VERSION=9.10.2 )
+    pattern = (
+        rf"if %CUDA_VER% EQU {re.escape(cuda_ver_nodot)}\s*\("
+        r"[\s\S]*?set EXPECTED_CUDNN_VERSION=(\d+\.\d+\.\d+)"
+    )
+    match = re.search(pattern, content)
+    if match:
+        return match.group(1)
+    return None
+
+
+def check_cudnn_version(cuda_version: str, actual_cudnn_version: str) -> None:
+    """Validate cuDNN version matches expected version from build config files."""
+    if sys.platform in ["linux", "linux2"]:
+        expected = get_expected_cudnn_version_linux(cuda_version)
+        source = "generate_binary_build_matrix.py"
+    elif sys.platform == "win32":
+        expected = get_expected_cudnn_version_windows(cuda_version)
+        source = "cuda_install.bat"
+    else:
+        print(f"cuDNN version check not supported on platform {sys.platform}")
+        return
+
+    if expected is None:
+        print(
+            f"Warning: Could not determine expected cuDNN version for CUDA {cuda_version} "
+            f"from {source}, skipping validation"
+        )
+        return
+
+    if not actual_cudnn_version.startswith(expected):
+        raise RuntimeError(
+            f"cuDNN version mismatch for CUDA {cuda_version}. "
+            f"Loaded: {actual_cudnn_version} Expected: {expected} (from {source})"
+        )
+    print(
+        f"cuDNN version check passed: {actual_cudnn_version} matches "
+        f"expected {expected} from {source}"
+    )
+
+
 def cudnn_to_version_str(cudnn_version: int) -> str:
     patch = int(cudnn_version % 10)
     minor = int((cudnn_version / 100) % 100)
@@ -294,6 +378,8 @@ def smoke_test_cuda(
             f"Expected: {torch_cudnn_compile_version}"
         )
 
+    check_cudnn_version(gpu_arch_ver, torch_cudnn_version)
+
     if sys.platform in ["linux", "linux2"]:
         torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
         print(f"Torch nccl; version: {torch_nccl_version}")
