NVIDIA · timmoon10 · May 12, 2026 · May 8, 2026 · May 8, 2026 · May 11, 2026
@@ -7,6 +7,7 @@
 #include <pybind.h>
 
 #include "common.h"
+#include "common/util/cuda_runtime.h"
 #include "common/util/system.h"
 #include "pybind.h"
 #include "torch/torch.h"
@@ -2243,7 +2244,8 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
 
   // The RHT cast fusion kernel computes the RHT on MMA hardware, so it is only used when the restrictions below are met
   bool eligible_for_rht_cast_fusion =
-      input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0;
+      input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0 &&
+      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() <= 110;
-      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() <= 110;
+      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() < 120;
-      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() <= 110;
+      transformer_engine::cuda::sm_arch() >= 100 && transformer_engine::cuda::sm_arch() < 120;
 
   // Stochastic rounding
   // When both rowwise and columnwise quantization are used with RHT,