RedisAI · dor-forer · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
@@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
 	endif()
 
+	# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
+	# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
+	set(_has_full_f16c FALSE)
+	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
+		set(_has_full_f16c TRUE)
+	endif()
+
+	# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
+	# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
+	# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
 	if(CXX_AVX2)
-		message("Building with AVX2")
-		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+		message("Building functions/AVX2.cpp with AVX2")
+		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
 		list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
 	endif()
 
+	if(CXX_AVX2 AND _has_full_f16c)
+		message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
+		set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
+	endif()
+
 	if(CXX_AVX2 AND CXX_FMA)
-		message("Building with AVX2 and FMA")
+		message("Building functions/AVX2_FMA.cpp with AVX2 and FMA")
 		set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
 		list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
 	endif()
 
+	if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
+		message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
+		set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
+	endif()
+
 	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 		message("Building with CXX_F16C")
 		set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -81,11 +103,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	endif()
 
 	if(CXX_SSE4)
-		message("Building with SSE4")
-		set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
+		message("Building functions/SSE4.cpp with SSE4.1")
+		set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS "-msse4.1")
 		list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
 	endif()
 
+	# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
+	# (mirrors the F16C.cpp recipe above).
+	if(CXX_SSE4 AND _has_full_f16c)
+		message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
+		set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
+		list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
+	endif()
+
 	if(CXX_SSE)
 		message("Building with SSE")
 		set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)

diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = Σ(x_i * y_i)
+ *            ≈ Σ((min + delta * q_i) * y_i)
+ *            = min * Σy_i + delta * Σ(q_i * y_i)
+ *            = min * y_sum + delta * quantized_dot_product
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
+ */
+
+// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
+static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
+                                                      const float16 *&pVect2, __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
+}
+
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
+// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
+// under-read.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two independent accumulators break the FMA dependency chain so consecutive iterations
+    // can issue in parallel through both FMA ports.
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
+        // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
+        // nothing to the dot product. SQ8 load is intentionally unmasked.
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        // FP16 side: load full 16-byte block (safe — dim >= 16 and metadata follows).
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        // Route the half-residual chunk to the second accumulator for ILP.
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
+                                           size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = Σ(x_i * y_i)
+ *            ≈ Σ((min + delta * q_i) * y_i)
+ *            = min * Σy_i + delta * Σ(q_i * y_i)
+ *            = min * y_sum + delta * quantized_dot_product
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps
+ * (no FMA tier — Haswell-era AVX2 without FMA support).
+ */
+
+// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
+static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                  __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
+}
+
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
+// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
+// under-read.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two independent accumulators break the mul→add dependency chain on Haswell-class CPUs
+    // without FMA, where the add cannot retire before the prior mul.
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
+        // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
+        // nothing to the dot product. SQ8 load is intentionally unmasked.
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        // Route the half-residual chunk to the second accumulator for ILP.
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
+}