Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
ad941b8
Add design doc for SQ8↔FP16 SIMD x86 kernels [MOD-14954]
dor-forer May 26, 2026
97467b2
Append -mf16c to AVX2_FMA/AVX2/SSE4 dispatcher sources [MOD-14954]
dor-forer May 26, 2026
bab7473
Add SQ8_FP16_SpacesOptimizationTest skeleton [MOD-14954]
dor-forer May 26, 2026
671a7cc
Add AVX-512 SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
c2f8340
Add AVX2+FMA SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
415c2ed
Add AVX2 (no FMA) SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
25c5a96
Add SSE4+F16C SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
4b7f3eb
Update SQ8_FP16 dispatcher assertions to walk SIMD tiers [MOD-14954]
dor-forer May 26, 2026
e21cb3b
Register per-ISA SQ8↔FP16 microbenchmarks [MOD-14954]
dor-forer May 26, 2026
4c8828e
Reformat SQ8↔FP16 SIMD kernels for consistent line breaks
dor-forer May 26, 2026
fdc5c1c
Address PR review findings for SQ8↔FP16 x86 kernels [MOD-14954]
dor-forer May 28, 2026
ce16f6b
Add multi-accumulator ILP to SQ8↔FP16 x86 kernels [MOD-14954]
dor-forer May 28, 2026
658c485
Drop misleading VNNI suffix from SQ8↔FP16 AVX-512 kernel [MOD-14954]
dor-forer May 28, 2026
fe69f85
Remove SQ8↔FP16 design doc from PR [MOD-14954]
dor-forer May 28, 2026
2a4ef92
Simplify SQ8↔FP16 tests to match sister conventions [MOD-14954]
dor-forer May 28, 2026
929f694
Split SQ8↔FP16 F16C kernels into sibling TUs [MOD-14954]
dor-forer May 28, 2026
b689840
Move SQ8↔FP16 AVX-512 dispatch to AVX512F tier + flatten F16C guards …
dor-forer May 28, 2026
839fe3c
Clean up whitespace and formatting inconsistencies
dor-forer May 28, 2026
3565985
Remove obsolete SQ8-to-FP16 dispatch comments
dor-forer May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions src/VecSim/spaces/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
endif()

# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
set(_has_full_f16c FALSE)
if(CXX_F16C AND CXX_FMA AND CXX_AVX)
set(_has_full_f16c TRUE)
endif()

# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
if(CXX_AVX2)
message("Building with AVX2")
set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
message("Building functions/AVX2.cpp with AVX2")
set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
endif()

if(CXX_AVX2 AND _has_full_f16c)
message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA)
message("Building with AVX2 and FMA")
message("Building functions/AVX2_FMA.cpp with AVX2 and FMA")
set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
endif()

if(CXX_F16C AND CXX_FMA AND CXX_AVX)
message("Building with CXX_F16C")
set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
Expand All @@ -81,11 +103,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
endif()

if(CXX_SSE4)
message("Building with SSE4")
set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
message("Building functions/SSE4.cpp with SSE4.1")
set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS "-msse4.1")
list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
endif()

# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
# (mirrors the F16C.cpp recipe above).
if(CXX_SSE4 AND _has_full_f16c)
message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
endif()

if(CXX_SSE)
message("Building with SSE")
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
Expand Down
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
* IP(x, y) = Σ(x_i * y_i)
* ≈ Σ((min + delta * q_i) * y_i)
* = min * Σy_i + delta * Σ(q_i * y_i)
* = min * y_sum + delta * quantized_dot_product
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
*/

// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
const float16 *&pVect2, __m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
}

// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
// under-read.
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two independent accumulators break the FMA dependency chain so consecutive iterations
// can issue in parallel through both FMA ports.
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

// Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
// FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
// nothing to the dot product. SQ8 load is intentionally unmasked.
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

// FP16 side: load full 16-byte block (safe — dim >= 16 and metadata follows).
__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
// Route the half-residual chunk to the second accumulator for ILP.
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}
112 changes: 112 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
* IP(x, y) = Σ(x_i * y_i)
* ≈ Σ((min + delta * q_i) * y_i)
* = min * Σy_i + delta * Σ(q_i * y_i)
* = min * y_sum + delta * quantized_dot_product
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps
* (no FMA tier — Haswell-era AVX2 without FMA support).
*/

// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
__m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
}

// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
// under-read.
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two independent accumulators break the mul→add dependency chain on Haswell-class CPUs
// without FMA, where the add cannot retire before the prior mul.
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

// Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
// FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
// nothing to the dot product. SQ8 load is intentionally unmasked.
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
// Route the half-residual chunk to the second accumulator for ILP.
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
}
Loading
Loading