From ec73fbe3a9fc00bb0c20e45aab40d4c98cb7505e Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Thu, 7 May 2026 17:05:44 -0700
Subject: [PATCH 1/4] Disable the RHT fusion for non-SM100 family devices

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
---
 transformer_engine/pytorch/csrc/quantizer.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index da91e5c170..bd5e2695d1 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -2243,7 +2243,11 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
 
   // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT
   bool eligible_for_rht_cast_fusion =
-      input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0;
+      input.dtype() == DType::kBFloat16 &&
+      rows % 64 == 0 &&
+      cols % 128 == 0 &&
+      transformer_engine::cuda::sm_arch() >= 100 &&
+      transformer_engine::cuda::sm_arch() <= 110;
 
   // Stochastic rounding
   // When both rowwise and columnwise quantization are used with RHT,

From d1848a841241507f8b36188742bd7f023c1d0070 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 8 May 2026 00:09:59 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 transformer_engine/pytorch/csrc/quantizer.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index bd5e2695d1..ec464b79bc 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -2243,11 +2243,8 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
 
   // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT
   bool eligible_for_rht_cast_fusion =
-      input.dtype() == DType::kBFloat16 &&
-      rows % 64 == 0 &&
-      cols % 128 == 0 &&
-      transformer_engine::cuda::sm_arch() >= 100 &&
-      transformer_engine::cuda::sm_arch() <= 110;
+      input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0 &&
+      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() <= 110;
 
   // Stochastic rounding
   // When both rowwise and columnwise quantization are used with RHT,

From 1425a8d9844a688273fb7467e86a98e3ac8de8f1 Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Mon, 11 May 2026 16:40:29 -0700
Subject: [PATCH 3/4] Fix the compilation error by including common/util/cuda_runtime.h

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
---
 transformer_engine/pytorch/csrc/quantizer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index ec464b79bc..b2334283cd 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -8,6 +8,7 @@
 
 #include "common.h"
 #include "common/util/system.h"
+#include "common/util/cuda_runtime.h"
 #include "pybind.h"
 #include "torch/torch.h"
 

From 44ca5ea0cdf6c6c53308a80c455a597b37f4ff6b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 May 2026 23:41:27 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 transformer_engine/pytorch/csrc/quantizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index b2334283cd..75e28c64ac 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -7,8 +7,8 @@
 #include <pybind.h>
 
 #include "common.h"
-#include "common/util/system.h"
 #include "common/util/cuda_runtime.h"
+#include "common/util/system.h"
 #include "pybind.h"
 #include "torch/torch.h"