From d3387fb952dcdaf09c2978552bf0119a9d71ecfb Mon Sep 17 00:00:00 2001 From: Hardik Sharma Date: Tue, 28 Apr 2026 08:45:32 -0700 Subject: [PATCH 1/2] Add C++ unit tests for cadence::quantized_conv1d_ncl.out and quantized_conv1d_nlc.out (#19161) Summary: Adds C++ unit tests for cadence::quantized_conv1d_ncl.out and cadence::quantized_conv1d_nlc.out in test_op_quantized_conv1d_ncl.cpp. Also modifies op_quantized_conv1d_ncl.cpp and op_quantized_conv1d_nlc.cpp in the HiFi backend to fix correctness issues surfaced by the new tests. Reviewed By: mcremon-meta, zonglinpeng Differential Revision: D97886683 --- .../hifi/operators/op_quantized_conv1d_ncl.cpp | 11 +++++++---- .../hifi/operators/op_quantized_conv1d_nlc.cpp | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp index ccc81a35aba..a0bed1e0b70 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp @@ -240,7 +240,10 @@ void xa_opt_quantized_conv1d_ncl_asym8uxsym8u( WORD32 x_stride = stride[0]; WORD32 x_padding = padding[0]; WORD32 input_zero_bias = -in_zero_point; - WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + const float eff_scale = bias_scale * (1.0f / output_scale); + WORD32 out_multiplier32 = (eff_scale >= 1.0f) + ? static_cast<WORD32>(2147483647) + : static_cast<WORD32>(eff_scale * 2147483648.0f); WORD32 out_shift32 = 0; WORD32 kernel_zero_bias = -weight_zero_point; @@ -419,9 +422,9 @@ void quantized_conv1d_ncl_per_tensor_out( out); } } else if (dtype == ScalarType::Byte) { - // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1). - // Fall back to generic implementation. - if (groups > 1) { + // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1) + // or stride > 1. Fall back to generic implementation. 
+ if (groups > 1 || stride[0] > 1) { impl::generic::native::quantized_conv1d_ncl_per_tensor_out( ctx, input, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index 2a11dbf358d..10c00bf536b 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -298,9 +298,9 @@ void quantized_conv1d_nlc_per_tensor_out( out); } } else if (dtype == ScalarType::Byte) { - // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1). - // Fall back to generic implementation. - if (groups > 1) { + // HiFi nnlib conv1d_std kernel does not support depthwise (groups > 1) + // or stride > 1. Fall back to generic implementation. + if (groups > 1 || stride[0] > 1) { impl::generic::native::quantized_conv1d_nlc_per_tensor_out( ctx, input, From 3f3d68405ea65d40bf8bc94f78ba164e131675e1 Mon Sep 17 00:00:00 2001 From: Hardik Sharma Date: Tue, 28 Apr 2026 08:45:32 -0700 Subject: [PATCH 2/2] Add C++ unit tests for cadence::fully_connected.out (#19165) Summary: Adds C++ unit tests for cadence::fully_connected.out in test_op_fully_connected.cpp. Tests verify correct dense layer output (y = Wx + b) across various input sizes, output feature counts, and with and without bias. 
Reviewed By: zonglinpeng, mcremon-meta Differential Revision: D97889888 --- backends/cadence/generic/operators/op_fully_connected.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/cadence/generic/operators/op_fully_connected.cpp b/backends/cadence/generic/operators/op_fully_connected.cpp index f1e53ad5f76..36befc52102 100644 --- a/backends/cadence/generic/operators/op_fully_connected.cpp +++ b/backends/cadence/generic/operators/op_fully_connected.cpp @@ -27,7 +27,8 @@ void linear( Tensor& output) { const float* __restrict__ input_data = input.const_data_ptr<float>(); const float* __restrict__ weight_data = weight.const_data_ptr<float>(); - const float* __restrict__ bias_data = bias.value().const_data_ptr<float>(); + const float* __restrict__ bias_data = + bias.has_value() ? bias.value().const_data_ptr<float>() : nullptr; float* __restrict__ output_data = output.mutable_data_ptr<float>(); // input comes in shape [batch_size, in_dim] @@ -43,7 +44,7 @@ void linear( for (int i = 0; i < leading_dims; ++i) { for (int j = 0; j < M; ++j) { - float sum = bias_data[j]; + float sum = bias_data != nullptr ? bias_data[j] : 0.0f; for (int k = 0; k < N; ++k) { sum += input_data[i * N + k] * weight_data[j * N + k]; }