Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,43 @@ jobs:
timeout-minutes: 10 # Consider increasing timeout
working-directory: build/benchmark
run: ./benchmark_xtensor

# CI job: build xtensor benchmarks with GCC 14 + xsimd and compare against
# NumPy rebuilt from source with matching optimization flags.
numpy:
  runs-on: ubuntu-24.04
  name: gcc 14 - numpy-xsimd
  steps:
    - name: Install GCC
      uses: egor-tensin/setup-gcc@v1
      with:
        version: '14'
        platform: x64

    - name: Checkout code
      # v3 runs on the deprecated Node 16 runtime; v4 is the supported release.
      uses: actions/checkout@v4

    - name: Set conda environment
      # Pin to a release tag instead of the mutable @main ref so CI runs are
      # reproducible and not broken by upstream changes.
      uses: mamba-org/setup-micromamba@v2
      with:
        environment-name: myenv
        environment-file: environment-dev.yml
        init-shell: bash
        cache-downloads: true

    - name: Configure using CMake
      run: |
        export CC=gcc-14
        export CXX=g++-14
        cmake -G Ninja -Bbuild -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_BENCHMARK=ON -DBUILD_NUMPY_BENCHMARKS=ON -DXTENSOR_USE_XSIMD=ON

    - name: Build NumPy from source
      # Rebuilds NumPy with the same compiler/flags as the benchmark target
      # (see the xbenchmark_numpy_env custom target).
      working-directory: build
      run: cmake --build . --target xbenchmark_numpy_env

    - name: Build benchmark target
      working-directory: build
      run: cmake --build . --target benchmark_xtensor --parallel 8

    - name: Run NumPy comparison benchmark
      timeout-minutes: 20
      working-directory: build/benchmark
      run: ./benchmark_xtensor --benchmark_filter='(add_|multiply_|sin_|exp_|sum_axis[01]_)'
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ OPTION(XTENSOR_CHECK_DIMENSION "xtensor dimension check" OFF)
OPTION(XTENSOR_FORCE_TEMPORARY_MEMORY_IN_ASSIGNMENTS "xtensor force the use of temporary memory when assigning instead of an automatic overlap check" ON)
OPTION(BUILD_TESTS "xtensor test suite" OFF)
OPTION(BUILD_BENCHMARK "xtensor benchmark" OFF)
OPTION(BUILD_NUMPY_BENCHMARKS "xtensor benchmark comparisons against NumPy" OFF)
OPTION(DOWNLOAD_GBENCHMARK "download google benchmark and build from source" ON)
OPTION(DEFAULT_COLUMN_MAJOR "set default layout to column major" OFF)
OPTION(CPP23 "enables C++23 (experimental)" OFF)
Expand Down Expand Up @@ -247,6 +248,10 @@ if(BUILD_TESTS)
add_subdirectory(test)
endif()

# BUILD_NUMPY_BENCHMARKS only takes effect inside benchmark/ (added below when
# BUILD_BENCHMARK is ON), so requesting it alone would silently do nothing.
# Fail fast with a clear message instead.
if(BUILD_NUMPY_BENCHMARKS AND NOT BUILD_BENCHMARK)
    message(FATAL_ERROR "BUILD_NUMPY_BENCHMARKS requires BUILD_BENCHMARK=ON")
endif()

if(BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif()
Expand Down
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,58 @@ cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix
make install
```

### Benchmarking xtensor against NumPy

The benchmark target can optionally compare xtensor and NumPy on matching
elementwise operations and reductions. The current comparison set includes
elementwise math functions together with reducers such as `sum`, `mean`,
`amin`, `amax`, and `prod`. The NumPy comparison path embeds Python and calls
the public NumPy API from the same benchmark executable.

To keep the comparison fair, build the benchmarks in release mode and rebuild
NumPy from source with the same optimization policy used by the xtensor
benchmark target:

```bash
cmake -G Ninja -Bbuild -DBUILD_BENCHMARK=ON -DBUILD_NUMPY_BENCHMARKS=ON -DXTENSOR_USE_XSIMD=ON
cmake --build build --target xbenchmark_numpy_env
cmake --build build --target benchmark_xtensor
./build/benchmark/benchmark_xtensor --benchmark_filter='(add_|multiply_|sin_|exp_|sum_|mean_|amin_|amax_|prod_)'
```

`xbenchmark_numpy_env` reinstalls NumPy from source with
`XTENSOR_NUMPY_BENCHMARK_CFLAGS`, which defaults to `-O3` and adds
`-march=native` when supported by the current compiler. It also exports
`CC` and `CXX` from the active CMake configuration so NumPy is built with the
same compiler toolchain as the xtensor benchmark target. The install uses
`--no-cache-dir --no-binary=numpy` so the target recompiles NumPy instead of
reusing a cached wheel. The requested BLAS and LAPACK backends are controlled
with `XTENSOR_NUMPY_BENCHMARK_BLAS` and `XTENSOR_NUMPY_BENCHMARK_LAPACK`,
which default to `openblas`, and the benchmark startup banner reports the
backend and compiler arguments NumPy actually used.

To generate a Markdown comparison report from the benchmark output, use:

```bash
python tools/report_numpy_benchmarks.py \
--benchmark-exe build/benchmark/benchmark_xtensor \
--benchmark-filter='(add_|multiply_|sin_|exp_|sum_|mean_|amin_|amax_|prod_)' \
--output build/xtensor_numpy_report.md
```

The script can also analyze an existing Google Benchmark JSON file via
`--input-json`.

For a one-command workflow from CMake, use:

```bash
cmake --build build --target xbenchmark_numpy_report
```

The report target rebuilds NumPy from source through `xbenchmark_numpy_env`,
runs the benchmark executable, and writes the Markdown report to
`XTENSOR_NUMPY_BENCHMARK_REPORT`.

### Installing xtensor using vcpkg

You can download and install xtensor using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
Expand Down
46 changes: 46 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,18 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)

include(CheckCXXCompilerFlag)

set(_xtensor_numpy_benchmark_cflags "-O3")

string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE)

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
CHECK_CXX_COMPILER_FLAG(-march=native arch_native_supported)
if(arch_native_supported AND NOT CMAKE_CXX_FLAGS MATCHES "-march")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
if(arch_native_supported)
string(APPEND _xtensor_numpy_benchmark_cflags " -march=native")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -Wunused-parameter -Wextra -Wreorder")

if(NOT "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC")
Expand Down Expand Up @@ -63,6 +68,13 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
endif()
endif()

# User-tunable knobs for the NumPy comparison benchmarks. All are cache
# variables, so each can be overridden on the configure command line
# (e.g. -DXTENSOR_NUMPY_BENCHMARK_BLAS=mkl) without editing this file.
# _xtensor_numpy_benchmark_cflags is assembled above: "-O3", plus
# "-march=native" when the compiler supports it.
set(XTENSOR_NUMPY_BENCHMARK_CFLAGS "${_xtensor_numpy_benchmark_cflags}" CACHE STRING "CFLAGS used to build NumPy from source for comparison benchmarks")
set(XTENSOR_NUMPY_BENCHMARK_BLAS "openblas" CACHE STRING "BLAS backend requested when building NumPy from source for comparison benchmarks")
set(XTENSOR_NUMPY_BENCHMARK_LAPACK "openblas" CACHE STRING "LAPACK backend requested when building NumPy from source for comparison benchmarks")
set(XTENSOR_NUMPY_BENCHMARK_REPORT "${CMAKE_BINARY_DIR}/xtensor_numpy_report.md" CACHE FILEPATH "Output path for the xtensor vs NumPy benchmark report")
set(XTENSOR_NUMPY_BENCHMARK_REPORT_FILTER ".*_(xtensor|numpy)/.*" CACHE STRING "Google Benchmark filter used when generating the xtensor vs NumPy report")
set(XTENSOR_NUMPY_BENCHMARK_REPORT_MIN_TIME "0.05s" CACHE STRING "Minimum benchmark runtime used when generating the xtensor vs NumPy report")

if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj")
set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO)
Expand Down Expand Up @@ -120,11 +132,45 @@ set(XTENSOR_BENCHMARK
main.cpp
)

if(BUILD_NUMPY_BENCHMARKS)
    # The NumPy comparison benchmarks embed CPython in the benchmark binary:
    # Interpreter is needed to locate python for the pip-driven custom targets,
    # Development.Embed provides the Python3::Python library to link against.
    list(APPEND XTENSOR_BENCHMARK benchmark_numpy.cpp)
    find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Embed)
endif()


set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor)
# EXCLUDE_FROM_ALL: benchmarks are only built when the target is requested explicitly.
add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS})
target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC xtensor ${GBENCHMARK_LIBRARIES})

if(BUILD_NUMPY_BENCHMARKS)
    target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_ENABLE_NUMPY_BENCHMARKS=1)
    target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC Python3::Python)

    # Reinstall NumPy from source so both sides of the comparison are compiled
    # with the same toolchain and optimization flags as the benchmark target.
    # --no-binary=numpy forces a source build; --no-cache-dir keeps pip from
    # reusing a previously built wheel with stale flags.
    # VERBATIM added for platform-independent argument escaping, consistent
    # with the xbenchmark_numpy_report target below.
    add_custom_target(xbenchmark_numpy_env
        COMMAND ${CMAKE_COMMAND} -E env
                "CC=${CMAKE_C_COMPILER}"
                "CXX=${CMAKE_CXX_COMPILER}"
                "CFLAGS=${XTENSOR_NUMPY_BENCHMARK_CFLAGS}"
                "CXXFLAGS=${XTENSOR_NUMPY_BENCHMARK_CFLAGS}"
                ${Python3_EXECUTABLE} -m pip install --upgrade --force-reinstall --no-cache-dir --no-binary=numpy
                --config-settings=setup-args=-Dblas=${XTENSOR_NUMPY_BENCHMARK_BLAS}
                --config-settings=setup-args=-Dlapack=${XTENSOR_NUMPY_BENCHMARK_LAPACK}
                numpy
        USES_TERMINAL
        VERBATIM
        COMMENT "Installing NumPy from source with benchmark flags: ${XTENSOR_NUMPY_BENCHMARK_CFLAGS}, BLAS=${XTENSOR_NUMPY_BENCHMARK_BLAS}, LAPACK=${XTENSOR_NUMPY_BENCHMARK_LAPACK}")

    # One-command workflow: rebuild NumPy, run the benchmark executable, and
    # write the Markdown comparison report to XTENSOR_NUMPY_BENCHMARK_REPORT.
    # PROJECT_SOURCE_DIR (not CMAKE_CURRENT_SOURCE_DIR/..) locates tools/
    # robustly relative to the project root.
    add_custom_target(xbenchmark_numpy_report
        COMMAND ${Python3_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/report_numpy_benchmarks.py
                --benchmark-exe $<TARGET_FILE:${XTENSOR_BENCHMARK_TARGET}>
                --benchmark-filter=${XTENSOR_NUMPY_BENCHMARK_REPORT_FILTER}
                --benchmark-min-time=${XTENSOR_NUMPY_BENCHMARK_REPORT_MIN_TIME}
                --output ${XTENSOR_NUMPY_BENCHMARK_REPORT}
        DEPENDS xbenchmark_numpy_env ${XTENSOR_BENCHMARK_TARGET}
        USES_TERMINAL
        VERBATIM
        COMMENT "Generating xtensor vs NumPy benchmark report at ${XTENSOR_NUMPY_BENCHMARK_REPORT}")
endif()

if(XTENSOR_USE_TBB)
target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_TBB)
target_include_directories(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_INCLUDE_DIRS})
Expand Down
Loading
Loading