From ec73fbe3a9fc00bb0c20e45aab40d4c98cb7505e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 7 May 2026 17:05:44 -0700 Subject: [PATCH 1/4] Disable the RHT fusion for non-SM100 family devices Signed-off-by: Przemek Tredak --- transformer_engine/pytorch/csrc/quantizer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index da91e5c170..bd5e2695d1 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -2243,7 +2243,11 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT bool eligible_for_rht_cast_fusion = - input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0; + input.dtype() == DType::kBFloat16 && + rows % 64 == 0 && + cols % 128 == 0 && + transformer_engine::cuda::sm_arch() >= 100 && + transformer_engine::cuda::sm_arch() <= 110; // Stochastic rounding // When both rowwise and columnwise quantization are used with RHT, From d1848a841241507f8b36188742bd7f023c1d0070 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 00:09:59 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/pytorch/csrc/quantizer.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index bd5e2695d1..ec464b79bc 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -2243,11 +2243,8 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT bool eligible_for_rht_cast_fusion = - input.dtype() == DType::kBFloat16 && - rows % 64 == 0 && - cols % 128 == 0 && - transformer_engine::cuda::sm_arch() >= 100 && - transformer_engine::cuda::sm_arch() <= 110; + input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0 && + transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() <= 110; // Stochastic rounding // When both rowwise and columnwise quantization are used with RHT, From 1425a8d9844a688273fb7467e86a98e3ac8de8f1 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 11 May 2026 16:40:29 -0700 Subject: [PATCH 3/4] Fix the compilation error Signed-off-by: Przemek Tredak --- transformer_engine/pytorch/csrc/quantizer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index ec464b79bc..b2334283cd 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -8,6 +8,7 @@ #include "common.h" #include "common/util/system.h" +#include "common/util/cuda_runtime.h" #include "pybind.h" #include "torch/torch.h" From 44ca5ea0cdf6c6c53308a80c455a597b37f4ff6b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 May 2026 23:41:27 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/pytorch/csrc/quantizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp index b2334283cd..75e28c64ac 100644 --- a/transformer_engine/pytorch/csrc/quantizer.cpp +++ b/transformer_engine/pytorch/csrc/quantizer.cpp @@ -7,8 +7,8 @@ #include #include "common.h" -#include "common/util/system.h" #include "common/util/cuda_runtime.h" +#include "common/util/system.h" #include "pybind.h" #include "torch/torch.h"